library(tidyverse) # for everything
library(readxl) # for reading in excel files
library(janitor) # data checks and cleaning
library(glue) # for easy pasting
library(FactoMineR) # for PCA
library(factoextra) # for PCA
library(rstatix) # for stats
library(pheatmap) # for heatmaps
library(plotly) # for interactive plots
library(htmlwidgets) # for saving interactive plots
library(devtools)
library(notame) # used for feature clustering
library(doParallel)
library(igraph) # feature clustering
library(ggpubr) # visualizations
library(knitr) # clean table printing
library(mixOmics) # for multilevel PCAs

# raw filtered metabolomics data in C18 (+)
# read the filtered feature table (rows = features, columns = samples)
omicsdata <- read_csv("Feature lists/C18Pos-Filtered-Data-05Jun23_946features.csv")

# metadata
metadata <- read_excel("Metadata-urine-c18pos.xlsx")

metadata <- metadata %>%
  rename("sample_ID" = Sample_ID)

# rename "row ID"
omicsdata <- omicsdata %>%
  rename("row_ID" = `row ID`)
# how many features
nrow(omicsdata)## [1] 946
# are there any duplicates?
omicsdata %>% get_dupes(mz_rt)## # A tibble: 2 × 60
## mz_rt dupe_count row_ID `6101_U4_C18POS_14` `6110_U2_C18POS_34`
## <chr> <int> <dbl> <dbl> <dbl>
## 1 369.152_4.502 2 4830 40321. 3464.
## 2 369.152_4.502 2 4832 40321. 3464.
## # ℹ 55 more variables: `6104_U2_C18POS_19` <dbl>, `6109_U4_C18POS_22` <dbl>,
## # `6111_U1_C18POS_51` <dbl>, `6110_U3_C18POS_45` <dbl>,
## # `6105_U1_C18POS_43` <dbl>, `6111_U4_C18POS_42` <dbl>,
## # `6113_U2_C18POS_31` <dbl>, `6105_U2_C18POS_40` <dbl>,
## # `6109_U1_C18POS_58` <dbl>, `6113_U4_C18POS_62` <dbl>,
## # `6102_U1_C18POS_26_1` <dbl>, `6102_U2_C18POS_16` <dbl>,
## # `6105_U4_C18POS_47` <dbl>, `6105_U3_C18POS_39` <dbl>, …
# Remove duplicates
# drop duplicated features, keeping the first occurrence of each mz_rt
omicsdata <- distinct(omicsdata, mz_rt, .keep_all = TRUE)
# check again for dupes
omicsdata %>% get_dupes(mz_rt)## # A tibble: 0 × 60
## # ℹ 60 variables: mz_rt <chr>, dupe_count <int>, row_ID <dbl>,
## # 6101_U4_C18POS_14 <dbl>, 6110_U2_C18POS_34 <dbl>, 6104_U2_C18POS_19 <dbl>,
## # 6109_U4_C18POS_22 <dbl>, 6111_U1_C18POS_51 <dbl>, 6110_U3_C18POS_45 <dbl>,
## # 6105_U1_C18POS_43 <dbl>, 6111_U4_C18POS_42 <dbl>, 6113_U2_C18POS_31 <dbl>,
## # 6105_U2_C18POS_40 <dbl>, 6109_U1_C18POS_58 <dbl>, 6113_U4_C18POS_62 <dbl>,
## # 6102_U1_C18POS_26_1 <dbl>, 6102_U2_C18POS_16 <dbl>,
## # 6105_U4_C18POS_47 <dbl>, 6105_U3_C18POS_39 <dbl>, …
# how many features
nrow(omicsdata)## [1] 945
# Remove weird empty column
colnames(omicsdata)## [1] "mz_rt"
## [2] "row_ID"
## [3] "6101_U4_C18POS_14"
## [4] "6110_U2_C18POS_34"
## [5] "6104_U2_C18POS_19"
## [6] "6109_U4_C18POS_22"
## [7] "6111_U1_C18POS_51"
## [8] "6110_U3_C18POS_45"
## [9] "6105_U1_C18POS_43"
## [10] "6111_U4_C18POS_42"
## [11] "6113_U2_C18POS_31"
## [12] "6105_U2_C18POS_40"
## [13] "6109_U1_C18POS_58"
## [14] "6113_U4_C18POS_62"
## [15] "6102_U1_C18POS_26_1"
## [16] "6102_U2_C18POS_16"
## [17] "6105_U4_C18POS_47"
## [18] "6105_U3_C18POS_39"
## [19] "6111_U2_C18POS_35"
## [20] "6103_U4_C18POS_53"
## [21] "6102_U3_C18POS_48"
## [22] "6102_U4_C18POS_50"
## [23] "6104_U4_C18POS_12"
## [24] "6101_U1_C18POS_59"
## [25] "6103_U3_C18POS_61"
## [26] "6108_U2_C18POS_27_1"
## [27] "6103_U2_C18POS_60"
## [28] "6101_U2_C18POS_30"
## [29] "6112_U2_C18POS_37"
## [30] "6113_U1_C18POS_24"
## [31] "6108_U4_C18POS_18"
## [32] "6104_U1_C18POS_55"
## [33] "6113_U3_C18POS_15"
## [34] "6101_U3_C18POS_29_1"
## [35] "6106_U3_C18POS_20"
## [36] "6106_U2_C18POS_46"
## [37] "6108_U1_C18POS_44"
## [38] "6111_U3_C18POS_54"
## [39] "6112_U4_C18POS_63"
## [40] "6109_U3_C18POS_52"
## [41] "6110_U1_C18POS_11"
## [42] "6110_U4_C18POS_start-10"
## [43] "6112_U3_C18POS_56"
## [44] "6108_U3_C18POS_28_1"
## [45] "6103_U1_C18POS_21"
## [46] "6106_U1_C18POS_23"
## [47] "6109_U2_C18POS_13"
## [48] "6106_U4_C18POS_36"
## [49] "6112_U1_C18POS_38"
## [50] "PooledQC6_C18POS_17"
## [51] "PooledQC11_C18POS_57"
## [52] "PooledQC9_C18POS_41"
## [53] "6104_U3_C18POS_32"
## [54] "PooledQC10_C18POS_49"
## [55] "PooledQC12_C18POS_64"
## [56] "PrerunPooledQC5tryagain-addednew3_C18POS_14_1"
## [57] "PooledQC8_C18POS_33"
## [58] "PooledQC7_C18POS_25"
## [59] "...59"
# remove weird lgl column
# (the trailing empty "...59" column was presumably read in as an all-NA
# logical column, so dropping logical columns removes it — TODO confirm
# no genuine data column is logical)
omicsdata <- omicsdata %>%
dplyr::select(!where(is.logical))
colnames(omicsdata)## [1] "mz_rt"
## [2] "row_ID"
## [3] "6101_U4_C18POS_14"
## [4] "6110_U2_C18POS_34"
## [5] "6104_U2_C18POS_19"
## [6] "6109_U4_C18POS_22"
## [7] "6111_U1_C18POS_51"
## [8] "6110_U3_C18POS_45"
## [9] "6105_U1_C18POS_43"
## [10] "6111_U4_C18POS_42"
## [11] "6113_U2_C18POS_31"
## [12] "6105_U2_C18POS_40"
## [13] "6109_U1_C18POS_58"
## [14] "6113_U4_C18POS_62"
## [15] "6102_U1_C18POS_26_1"
## [16] "6102_U2_C18POS_16"
## [17] "6105_U4_C18POS_47"
## [18] "6105_U3_C18POS_39"
## [19] "6111_U2_C18POS_35"
## [20] "6103_U4_C18POS_53"
## [21] "6102_U3_C18POS_48"
## [22] "6102_U4_C18POS_50"
## [23] "6104_U4_C18POS_12"
## [24] "6101_U1_C18POS_59"
## [25] "6103_U3_C18POS_61"
## [26] "6108_U2_C18POS_27_1"
## [27] "6103_U2_C18POS_60"
## [28] "6101_U2_C18POS_30"
## [29] "6112_U2_C18POS_37"
## [30] "6113_U1_C18POS_24"
## [31] "6108_U4_C18POS_18"
## [32] "6104_U1_C18POS_55"
## [33] "6113_U3_C18POS_15"
## [34] "6101_U3_C18POS_29_1"
## [35] "6106_U3_C18POS_20"
## [36] "6106_U2_C18POS_46"
## [37] "6108_U1_C18POS_44"
## [38] "6111_U3_C18POS_54"
## [39] "6112_U4_C18POS_63"
## [40] "6109_U3_C18POS_52"
## [41] "6110_U1_C18POS_11"
## [42] "6110_U4_C18POS_start-10"
## [43] "6112_U3_C18POS_56"
## [44] "6108_U3_C18POS_28_1"
## [45] "6103_U1_C18POS_21"
## [46] "6106_U1_C18POS_23"
## [47] "6109_U2_C18POS_13"
## [48] "6106_U4_C18POS_36"
## [49] "6112_U1_C18POS_38"
## [50] "PooledQC6_C18POS_17"
## [51] "PooledQC11_C18POS_57"
## [52] "PooledQC9_C18POS_41"
## [53] "6104_U3_C18POS_32"
## [54] "PooledQC10_C18POS_49"
## [55] "PooledQC12_C18POS_64"
## [56] "PrerunPooledQC5tryagain-addednew3_C18POS_14_1"
## [57] "PooledQC8_C18POS_33"
## [58] "PooledQC7_C18POS_25"
# create long df for omics df
omicsdata_tidy <- omicsdata %>%
  pivot_longer(cols = 3:ncol(.),
               names_to = "sample_ID",
               values_to = "peak_height")

# combine meta and omics dfs
df_combined <- full_join(omicsdata_tidy,
                         metadata,
                         by = "sample_ID")

# separate mz and rt, convert column types, and move the sample/design
# columns to the front
df_combined_sep <- df_combined %>%
  separate(col = mz_rt,
           into = c("mz", "rt"),
           sep = "_") %>%
  mutate(mz = as.numeric(mz),
         rt = as.numeric(rt),
         Subject = as.character(Subject),
         Intervention = as.character(Intervention)) %>%
  dplyr::select(sample_ID, pre_post, Intervention, everything())
str(df_combined_sep)## tibble [52,920 × 14] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : chr [1:52920] "6101_U4_C18POS_14" "6110_U2_C18POS_34" "6104_U2_C18POS_19" "6109_U4_C18POS_22" ...
## $ pre_post : chr [1:52920] "post" "post" "post" "post" ...
## $ Intervention : chr [1:52920] "Yellow" "Yellow" "Yellow" "Red" ...
## $ mz : num [1:52920] 227 227 227 227 227 ...
## $ rt : num [1:52920] 0.553 0.553 0.553 0.553 0.553 0.553 0.553 0.553 0.553 0.553 ...
## $ row_ID : num [1:52920] 37 37 37 37 37 37 37 37 37 37 ...
## $ peak_height : num [1:52920] 82834 159688 140461 47134 83790 ...
## $ Subject : chr [1:52920] "6101" "6110" "6104" "6109" ...
## $ Period : chr [1:52920] "U4" "U2" "U2" "U4" ...
## $ sequence : chr [1:52920] "R_Y" "Y_R" "Y_R" "Y_R" ...
## $ Intervention_week: num [1:52920] 14 6 6 14 2 10 2 14 6 6 ...
## $ Sex : chr [1:52920] "F" "M" "M" "F" ...
## $ Age : num [1:52920] 58 36 54 75 46 36 40 46 61 40 ...
## $ BMI : num [1:52920] 31.1 29.9 33.1 43.3 30 ...
# replace NA's in subject and intervention columns with QC
# (QC injections have no metadata, so the join left these NA)
df_combined_sep$Subject <- df_combined_sep$Subject %>%
  replace_na("QC")

df_combined_sep$Intervention <- df_combined_sep$Intervention %>%
  replace_na("QC")

nrow(omicsdata)
## [1] 945
range(df_combined_sep$mz)
## [1] 61.0395 1106.5217
range(df_combined_sep$rt)
## [1] 0.553 10.707

# plot mz across retention time for all features
(plot_mzvsrt <- df_combined_sep %>%
  ggplot(aes(x = rt, y = mz)) +
  geom_point() +
  theme_minimal() +
  labs(x = "Retention time, min",
       y = "m/z, neutral",
       title = "mz across RT for all features"))

# distribution of features by mass
df_combined_sep %>%
  ggplot(aes(x = mz)) +
  geom_histogram(binwidth = 25) +
  theme_minimal() +
  labs(x = "Monoisotopic mass (amu)",
       y = "Number of features",
       title = "Distribution of features by mass")

# distribution of features by retention time
df_combined_sep %>%
  ggplot(aes(x = rt)) +
  geom_histogram(binwidth = 0.1) + # 6 second bins
  theme_minimal() +
  labs(x = "Retention time",
       y = "Number of features",
       title = "Distribution of features by retention time")

# NAs in all data including QCs
NAbyRow <- rowSums(is.na(omicsdata[,-1]))
hist(NAbyRow,
     breaks = 56, # because there are 56 samples, 48 samples + 8 QCs
     xlab = "Number of missing values",
     ylab = "Number of metabolites",
     main = "How many missing values are there?")

# samples only (no QCs)
omicsdata_noQC <- omicsdata %>%
  dplyr::select(-contains("QC"))

# NAs in samples only?
NAbyRow_noQC <- rowSums(is.na(omicsdata_noQC[,-1]))
hist(NAbyRow_noQC,
     breaks = 48, # because there are 48 samples
     xlab = "Number of missing values",
     ylab = "Number of metabolites",
     main = "How many missing values are there?")

# Are there any missing values in QCs? There shouldn't be after data
# preprocessing/filtering
# QC columns only (QC sample names all start with "P": PooledQC/Prerun)
omicsdata_QC <- omicsdata %>%
dplyr::select(starts_with("P"))
# NOTE(review): colSums() counts NAs per COLUMN (per QC sample), so the
# name NAbyRow_QC is misleading; the grand total below is unaffected
NAbyRow_QC <- colSums(is.na(omicsdata_QC))
# lets confirm that there are no missing values from my QCs
sum(NAbyRow_QC) # no## [1] 0
# calculate how many NAs there are per feature in the whole data set
contains_NAs <- df_combined %>%
  group_by(mz_rt) %>%
  count(is.na(peak_height)) %>%
  filter(`is.na(peak_height)`)
head(contains_NAs)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 44
## 2 106.0133_6.888 TRUE 36
## 3 1071.4352_3.059 TRUE 44
## 4 1106.5217_3.833 TRUE 44
## 5 119.0485_0.793 TRUE 36
## 6 119.0492_3.019 TRUE 21
# NAs by groups
# calculate NAs per feature in the red intervention
NAs_Red_Intervention <- df_combined %>%
  filter(Intervention == "Red") %>%
  group_by(mz_rt) %>%
  count(is.na(peak_height)) %>%
  filter(`is.na(peak_height)` == TRUE)
head(NAs_Red_Intervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 15
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 19
## 6 119.0492_3.019 TRUE 12
# calculate NAs per feature in the yellow intervention
NAs_Yellow_Intervention <- df_combined %>%
  filter(Intervention == "Yellow") %>%
  group_by(mz_rt) %>%
  count(is.na(peak_height)) %>%
  filter(`is.na(peak_height)` == TRUE)
head(NAs_Yellow_Intervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 21
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 17
## 6 119.0492_3.019 TRUE 9
# calculate NAs per feature before both interventions
NAs_preIntervention <- df_combined %>%
  filter(pre_post == "pre") %>%
  group_by(mz_rt) %>%
  count(is.na(peak_height)) %>%
  filter(`is.na(peak_height)` == TRUE)
head(NAs_preIntervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 17
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 18
## 6 119.0492_3.019 TRUE 10
# calculate NAs per feature after both interventions
NAs_postIntervention <- df_combined %>%
  filter(pre_post == "post") %>%
  group_by(mz_rt) %>%
  count(is.na(peak_height)) %>%
  filter(`is.na(peak_height)` == TRUE)
head(NAs_postIntervention)## # A tibble: 6 × 3
## # Groups: mz_rt [6]
## mz_rt `is.na(peak_height)` n
## <chr> <lgl> <int>
## 1 1056.4441_3.056 TRUE 22
## 2 106.0133_6.888 TRUE 19
## 3 1071.4352_3.059 TRUE 22
## 4 1106.5217_3.833 TRUE 22
## 5 119.0485_0.793 TRUE 18
## 6 119.0492_3.019 TRUE 11
# To try to handle outliers, I came up with filtering out metabolites that are only present in
# one person, i.e. removing metabolites that are missing from at least 44 samples. I am taking
# this bit out for now as it did not change anything.
# remove features that have 44 or more NAs
omit_features <- contains_NAs %>%
  filter(n >= 44)

# preview: how many features are flagged for removal
nrow(omit_features) # features to remove

# how many features would remain after removal?
nrow(omicsdata) - nrow(omit_features)

# now remove these features from the omics dataset
omicsdata <- omicsdata %>%
  anti_join(omit_features,
            by = "mz_rt")

# how many features are there now?
nrow(omicsdata)

# impute any missing values by replacing them with 1/2 of the lowest peak height value of a feature (i.e. in a row).
# NOTE(review): lapply() over a data frame iterates COLUMNS, so each NA
# is replaced with half the minimum of its SAMPLE (column), not half the
# minimum of its feature (row) as the comment above describes — confirm
# which was intended. The character mz_rt column survives only because
# it contains no NAs, so ifelse() never evaluates min(x)/2 for it.
imputed_omicsdata <- omicsdata
imputed_omicsdata[] <- lapply(imputed_omicsdata,
function(x) ifelse(is.na(x),
min(x, na.rm = TRUE)/2, x))
dim(imputed_omicsdata)## [1] 945 58
# Are there any NAs?
# total NA count after imputation (should be zero)
sum(is.na(imputed_omicsdata))
## [1] 0
# imputations worked

# create long df for imputed omics df
imputed_omicsdata_tidy <- imputed_omicsdata %>%
  pivot_longer(cols = 3:ncol(.),
               names_to = "sample_ID",
               values_to = "peak_height")

# combine meta and imputed omics dfs
imputed_fulldata <- full_join(imputed_omicsdata_tidy,
                              metadata,
                              by = "sample_ID")

# separate mz and rt and convert column types
imputed_fulldata_sep <- imputed_fulldata %>%
  separate(col = mz_rt,
           into = c("mz", "rt"),
           sep = "_") %>%
  mutate(mz = as.numeric(mz),
         rt = as.numeric(rt),
         Subject = as.character(Subject),
         Intervention = as.character(Intervention))

# rt vs mz plot
imputed_fulldata_sep %>%
  ggplot(aes(x = rt, y = mz)) +
  geom_point() +
  theme_minimal() +
  labs(x = "RT (min)",
       y = "mz")
# Notame feature reduction vignette for reference
# browseVignettes("notame")

# create features list from the imputed data set: unique feature IDs
# (mz_rt) together with their mz and RT
features <- imputed_fulldata_sep %>%
  cbind(imputed_fulldata$mz_rt) %>%
  rename("mz_rt" = "imputed_fulldata$mz_rt") %>%
  dplyr::select(mz_rt, mz, rt) %>%
  distinct() # drop the duplicate rows

# second data frame: imputed_omicsdata restructured to the wide format
# notame expects (samples in rows, features in columns)
data_notame <- data.frame(imputed_omicsdata %>%
                            dplyr::select(-row_ID) %>%
                            t())

data_notame <- data_notame %>%
  tibble::rownames_to_column() %>% # samples: rownames -> own column
  row_to_names(row_number = 1)     # feature IDs (mz_rt): first row -> column names

# Check structures
# check if mz and rt are numeric
str(features)## 'data.frame': 945 obs. of 3 variables:
## $ mz_rt: chr "226.9516_0.553" "159.1492_0.608" "175.1442_0.616" "189.1684_0.616" ...
## $ mz : num 227 159 175 189 189 ...
## $ rt : num 0.553 0.608 0.616 0.616 0.615 0.621 0.62 0.633 0.635 0.636 ...
tibble(features)## # A tibble: 945 × 3
## mz_rt mz rt
## <chr> <dbl> <dbl>
## 1 226.9516_0.553 227. 0.553
## 2 159.1492_0.608 159. 0.608
## 3 175.1442_0.616 175. 0.616
## 4 189.1684_0.616 189. 0.616
## 5 189.16_0.615 189. 0.615
## 6 156.0769_0.621 156. 0.621
## 7 170.0926_0.62 170. 0.62
## 8 136.0482_0.633 136. 0.633
## 9 151.1443_0.635 151. 0.635
## 10 137.071_0.636 137. 0.636
## # ℹ 935 more rows
# check if results are numeric
tibble(data_notame)## # A tibble: 56 × 946
## mz_rt `226.9516_0.553` `159.1492_0.608` `175.1442_0.616` `189.1684_0.616`
## <chr> <chr> <chr> <chr> <chr>
## 1 6101_U4_… " 82834.1800" " 40852.7970" " 13575.6550" " 19074.8930"
## 2 6110_U2_… " 159688.0300" " 17948.1500" " 21984.4790" " 17190.7050"
## 3 6104_U2_… " 140460.9000" " 32255.4550" " 14964.8470" " 20831.3890"
## 4 6109_U4_… " 47134.4200" " 63559.5400" " 52516.5600" " 24691.2460"
## 5 6111_U1_… " 83789.7700" " 131795.4400" " 47572.5400" " 31355.4630"
## 6 6110_U3_… " 115715.8700" " 71032.5500" " 14294.8420" " 27822.7420"
## 7 6105_U1_… " 141117.8600" " 72057.3500" " 44426.5080" " 28435.2440"
## 8 6111_U4_… " 103462.300" " 120798.030" " 50446.316" " 33316.652"
## 9 6113_U2_… " 121278.8000" " 92756.7900" " 37672.3800" " 34728.8440"
## 10 6105_U2_… " 92647.2340" " 98266.6800" " 55319.7970" " 26269.3400"
## # ℹ 46 more rows
## # ℹ 941 more variables: `189.16_0.615` <chr>, `156.0769_0.621` <chr>,
## # `170.0926_0.62` <chr>, `136.0482_0.633` <chr>, `151.1443_0.635` <chr>,
## # `137.071_0.636` <chr>, `182.0574_0.654` <chr>, `162.1126_0.642` <chr>,
## # `76.0757_0.642` <chr>, `114.0669_0.645` <chr>, `227.1255_0.647` <chr>,
## # `193.1547_0.646` <chr>, `219.1705_0.65` <chr>, `163.1243_0.654` <chr>,
## # `213.1234_0.672` <chr>, `203.1502_0.654` <chr>, `138.0551_0.654` <chr>, …
# convert the result columns (all but the first, mz_rt/sample names)
# to numeric; across() replaces the superseded mutate_at()
data_notame <- data_notame %>%
  mutate(across(-1, as.numeric))
tibble(data_notame)## # A tibble: 56 × 946
## mz_rt `226.9516_0.553` `159.1492_0.608` `175.1442_0.616` `189.1684_0.616`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6101_U4_… 82834. 40853. 13576. 19075.
## 2 6110_U2_… 159688. 17948. 21984. 17191.
## 3 6104_U2_… 140461. 32255. 14965. 20831.
## 4 6109_U4_… 47134. 63560. 52517. 24691.
## 5 6111_U1_… 83790. 131795. 47573. 31355.
## 6 6110_U3_… 115716. 71033. 14295. 27823.
## 7 6105_U1_… 141118. 72057. 44427. 28435.
## 8 6111_U4_… 103462. 120798. 50446. 33317.
## 9 6113_U2_… 121279. 92757. 37672. 34729.
## 10 6105_U2_… 92647. 98267. 55320. 26269.
## # ℹ 46 more rows
## # ℹ 941 more variables: `189.16_0.615` <dbl>, `156.0769_0.621` <dbl>,
## # `170.0926_0.62` <dbl>, `136.0482_0.633` <dbl>, `151.1443_0.635` <dbl>,
## # `137.071_0.636` <dbl>, `182.0574_0.654` <dbl>, `162.1126_0.642` <dbl>,
## # `76.0757_0.642` <dbl>, `114.0669_0.645` <dbl>, `227.1255_0.647` <dbl>,
## # `193.1547_0.646` <dbl>, `219.1705_0.65` <dbl>, `163.1243_0.654` <dbl>,
## # `213.1234_0.672` <dbl>, `203.1502_0.654` <dbl>, `138.0551_0.654` <dbl>, …
# find pairs of features likely derived from the same compound:
# abundance correlation above corr_thresh AND retention times within
# rt_window (1/60 = 1 second, assuming rt is in minutes as the plot
# labels suggest — TODO confirm against the notame docs)
connection <- find_connections(data = data_notame,
features = features,
corr_thresh = 0.9,
rt_window = 1/60,
name_col = "mz_rt",
mz_col = "mz",
rt_col = "rt")## [1] 100
## [1] 200
## [1] 300
## [1] 400
## [1] 500
## [1] 600
## [1] 700
## [1] 800
## [1] 900
head(connection)## x y cor rt_diff mz_diff
## 1 151.1443_0.635 76.0757_0.642 0.9772910 0.007 -75.0686
## 2 182.0574_0.654 287.1967_0.655 0.9865474 0.001 105.1393
## 3 114.0669_0.645 227.1255_0.647 0.9689492 0.002 113.0586
## 4 219.1705_0.65 145.1054_0.656 0.9099522 0.006 -74.0651
## 5 144.1023_0.656 145.1054_0.656 0.9856422 0.000 1.0031
## 6 343.1668_0.698 258.1241_0.69 0.9555601 -0.008 -85.0427
clusters <- find_clusters(connections = connection, d_thresh = 0.8)## 113 components found
##
## Component 100 / 113
## 37 components found
##
## 12 components found
##
## 9 components found
##
## 2 components found
##
## 1 components found
# assign a cluster ID to all features. Clusters are named after the
# feature with the highest median peak height
features_clustered <- assign_cluster_id(data_notame, clusters, features, name_col = "mz_rt")
# export clustered feature list
write_csv(features_clustered,
"features_notame-clusters_c18-pos.csv")
# visualize clusters
#visualize_clusters(data_notame, features, clusters, min_size = 3, rt_window = 2,name_col = "mz_rt", mz_col = "mz", rt_col = "rt", file_path = "~/path/to/project/")
# lets see how many features are removed when we only keep one feature per cluster
pulled <- pull_clusters(data_notame, features_clustered, name_col = "mz_rt")
cluster_data <- pulled$cdata # abundance data returned by pull_clusters
cluster_features <- pulled$cfeatures # feature info returned by pull_clusters
nrow(omicsdata) - nrow(cluster_features)## [1] 317
# transpose the full dataset back to wide so that it is more similar to
# the notame dataset
imputed_fulldata_wide <- imputed_fulldata %>%
  dplyr::select(-row_ID) %>%
  pivot_wider(names_from = mz_rt,
              values_from = peak_height)

# list of reduced features
clusternames <- cluster_features$mz_rt

# keep only the features that appear in the reduced list
imp_clust <- imputed_fulldata_wide[, names(imputed_fulldata_wide) %in% clusternames]

# bind back sample names
imp_clust <- cbind(imputed_fulldata_wide[1], imp_clust)
tibble(imp_clust)## # A tibble: 56 × 629
## sample_ID `226.9516_0.553` `159.1492_0.608` `175.1442_0.616` `189.1684_0.616`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6101_U4_… 82834. 40853. 13576. 19075.
## 2 6110_U2_… 159688. 17948. 21984. 17191.
## 3 6104_U2_… 140461. 32255. 14965. 20831.
## 4 6109_U4_… 47134. 63560. 52517. 24691.
## 5 6111_U1_… 83790. 131795. 47573. 31355.
## 6 6110_U3_… 115716. 71033. 14295. 27823.
## 7 6105_U1_… 141118. 72057. 44427. 28435.
## 8 6111_U4_… 103462. 120798. 50446. 33317.
## 9 6113_U2_… 121279. 92757. 37672. 34729.
## 10 6105_U2_… 92647. 98267. 55320. 26269.
## # ℹ 46 more rows
## # ℹ 624 more variables: `189.16_0.615` <dbl>, `156.0769_0.621` <dbl>,
## # `170.0926_0.62` <dbl>, `136.0482_0.633` <dbl>, `137.071_0.636` <dbl>,
## # `162.1126_0.642` <dbl>, `76.0757_0.642` <dbl>, `114.0669_0.645` <dbl>,
## # `193.1547_0.646` <dbl>, `219.1705_0.65` <dbl>, `163.1243_0.654` <dbl>,
## # `213.1234_0.672` <dbl>, `203.1502_0.654` <dbl>, `138.0551_0.654` <dbl>,
## # `146.0812_0.71` <dbl>, `141.0659_0.655` <dbl>, `138.0639_0.655` <dbl>, …
# plot new rt vs mz scatterplot post-clustering
(plot_mzvsrt_postcluster <- cluster_features %>%
  ggplot(aes(x = rt,
             y = mz)) +
  geom_point() +
  theme_minimal() +
  labs(x = "Retention time, min",
       y = "m/z, neutral",
       title = "mz across RT for all features after clustering"))

# plot both scatterplots to compare with and without notame clustering
(scatterplots <- ggarrange(plot_mzvsrt,
                           plot_mzvsrt_postcluster,
                           nrow = 2))

# join metadata back onto the cluster-reduced feature table
imp_metabind_clust <- right_join(metadata,
                                 imp_clust,
                                 by = "sample_ID")

# change meta data columns to character so that I can change NAs from
# QCs to "QC"
imp_metabind_clust <- imp_metabind_clust %>%
  mutate_at(c("Subject",
              "Period",
              "Intervention",
              "pre_post",
              "sequence",
              "Intervention_week",
              "Sex",
              "Age",
              "BMI"),
            as.character)

# replace NAs in metadata columns for QCs
# (feature columns were imputed above, so only metadata columns hold NAs)
imp_metabind_clust[is.na(imp_metabind_clust)] <- "QC"

# unite pre_post column with intervention column to create
# pre_post_intervention column
imp_metabind_clust <- imp_metabind_clust %>%
  unite(col = "pre_post_intervention",
        c("pre_post", "Intervention"),
        sep = "_",
        remove = FALSE)

# long df
imp_metabind_clust_tidy <- imp_metabind_clust %>%
  pivot_longer(cols = 12:ncol(.),
               names_to = "mz_rt",
               values_to = "rel_abund")
# structure check
str(imp_metabind_clust_tidy)## tibble [35,168 × 13] (S3: tbl_df/tbl/data.frame)
## $ sample_ID : chr [1:35168] "6101_U1_C18POS_59" "6101_U1_C18POS_59" "6101_U1_C18POS_59" "6101_U1_C18POS_59" ...
## $ Subject : chr [1:35168] "6101" "6101" "6101" "6101" ...
## $ Period : chr [1:35168] "U1" "U1" "U1" "U1" ...
## $ pre_post_intervention: chr [1:35168] "pre_Red" "pre_Red" "pre_Red" "pre_Red" ...
## $ Intervention : chr [1:35168] "Red" "Red" "Red" "Red" ...
## $ pre_post : chr [1:35168] "pre" "pre" "pre" "pre" ...
## $ sequence : chr [1:35168] "R_Y" "R_Y" "R_Y" "R_Y" ...
## $ Intervention_week : chr [1:35168] "2" "2" "2" "2" ...
## $ Sex : chr [1:35168] "F" "F" "F" "F" ...
## $ Age : chr [1:35168] "58" "58" "58" "58" ...
## $ BMI : chr [1:35168] "31.0556029483653" "31.0556029483653" "31.0556029483653" "31.0556029483653" ...
## $ mz_rt : chr [1:35168] "226.9516_0.553" "159.1492_0.608" "175.1442_0.616" "189.1684_0.616" ...
## $ rel_abund : num [1:35168] 108697 81884 16201 28592 407452 ...
# boxplot of unscaled feature abundances per sample, colored by
# intervention group (grey = QC, red/yellow = interventions)
imp_metabind_clust_tidy %>%
ggplot(aes(x = sample_ID, y = rel_abund, color = Intervention)) +
geom_boxplot(alpha = 0.6) +
scale_color_manual(values = c("light grey", "tomato1", "gold")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90)) +
labs(title = "LC-MS (+) Feature Abundances by Sample",
subtitle = "Unscaled data",
y = "Relative abundance")
# Will need to log transform in order to normalize and actually see the data.
# log2 transform abundances; non-positive values map to 0
imp_metabind_clust_tidy_log2 <- imp_metabind_clust_tidy %>%
  mutate(rel_abund_log2 = if_else(rel_abund > 0, log2(rel_abund), 0)) %>%
  replace(is.na(.), 0)

# boxplot of log2-transformed abundances per sample
(bp_data_quality <- imp_metabind_clust_tidy_log2 %>%
  ggplot(aes(x = sample_ID, y = rel_abund_log2, fill = Intervention)) +
  geom_boxplot(alpha = 0.6) +
  scale_fill_manual(values = c("light grey", "tomato1", "gold")) +
  theme_minimal() +
  labs(title = "LC-MS (+) Feature Abundances by Sample",
       subtitle = "Log2 transformed data",
       y = "Relative abundance"))

# filtered and imputed data after notame clustering, transposed
features_testforQCcorr <- t(imp_clust) %>%
  as.data.frame() %>%
  row_to_names(row_number = "find_header")

# log2 transform
log2_features_testforQCcorr <- features_testforQCcorr %>%
  mutate_all(as.numeric) %>%
  log2()

# write csv to manually edit
write.csv(log2_features_testforQCcorr,
          "notame_dfs/feaures_test.csv",
          row.names = TRUE)

# Import corrected df (edited so that mz_rt could rowname 1)
# for some reason the r is not recognizing my wd so I'll run file.choose
features_forQCcorr <- read.csv("notame_dfs/feaures_forQCcorr.csv",
                               header = FALSE,
                               row.names = 1)

features_forQCcorr <- features_forQCcorr %>%
  rownames_to_column(var = "mz_rt") %>%
  row_to_names(row_number = 1) %>%
  separate(col = mz_rt,
           into = c("mz", "rt"),
           sep = "_")

write.csv(features_forQCcorr,
          "notame_dfs/features_forQCcorr_pt2.csv",
          row.names = TRUE)

# separate sampleID and injection order
# split sample names into sample_ID and injection order
pheno_data <- imp_clust[1] %>%
  separate(col = sample_ID,
           into = c("sample_ID", "injection_order"),
           sep = "_C18POS_")

# fix injection numbers to correct number
# (row indices identified manually — TODO confirm against the run sheet)
pheno_data[54, "injection_order"] <- 9
pheno_data[40, "injection_order"] <- 10
pheno_data[13, "injection_order"] <- 26
pheno_data[24, "injection_order"] <- 27
pheno_data[42, "injection_order"] <- 28
pheno_data[32, "injection_order"] <- 29

# make inj order column numeric
pheno_data <- pheno_data %>%
  mutate_at("injection_order", as.numeric)

t_pheno_data <- as.data.frame(t(pheno_data))
write.csv(t_pheno_data,
          "notame_dfs/pheno_df.csv",
          row.names = TRUE)

# Combine pheno and feature dfs manually in excel to create metaboset df.
# make sure when converting csv to xlsx that you save as a new file,
# don't just change the name of the file
# read the manually assembled pheno + feature sheet; split_by names the
# columns that define the analytical mode ("column" and "Ion mode")
metaboset <- read_from_excel("notame_dfs/metaboset.xlsx",
split_by = c("column", "Ion mode"))## INFO [2023-06-20 11:08:39] Detecting corner position
## INFO [2023-06-20 11:08:39] Corner detected correctly at row 3, column D
## INFO [2023-06-20 11:08:39]
## Extracting sample information from rows 1 to 3 and columns E to BH
## INFO [2023-06-20 11:08:39] Replacing spaces in sample information column names with underscores (_)
## INFO [2023-06-20 11:08:39] Naming the last column of sample information "Datafile"
## INFO [2023-06-20 11:08:39]
## Extracting feature information from rows 4 to 631 and columns A to D
## INFO [2023-06-20 11:08:39] Creating Split column from column, Ion mode
## INFO [2023-06-20 11:08:39] Feature_ID column not found, creating feature IDs
## INFO [2023-06-20 11:08:39] Identified m/z column mass and retention time column rt
## INFO [2023-06-20 11:08:39] Creating feature IDs from Split, m/z and retention time
## INFO [2023-06-20 11:08:39] Replacing dots (.) in feature information column names with underscores (_)
## INFO [2023-06-20 11:08:39]
## Extracting feature abundances from rows 4 to 631 and columns E to BH
## INFO [2023-06-20 11:08:39]
## Checking sample information
## INFO [2023-06-20 11:08:39] QC column generated from rows containing 'QC'
## INFO [2023-06-20 11:08:39] Sample ID autogenerated from injection orders and prefix ID_
## INFO [2023-06-20 11:08:39] Checking that feature abundances only contain numeric values
## INFO [2023-06-20 11:08:39]
## Checking feature information
## INFO [2023-06-20 11:08:39] Checking that feature IDs are unique and not stored as numbers
## INFO [2023-06-20 11:08:39] Checking that m/z and retention time values are reasonable
# construct MetaboSet object(s) from the parts returned by
# read_from_excel(); samples are grouped by the "Class" column
modes <- construct_metabosets(exprs = metaboset$exprs,
pheno_data = metaboset$pheno_data,
feature_data = metaboset$feature_data, group_col = "Class")## Initializing the object(s) with unflagged features
## INFO [2023-06-20 11:08:39]
## Checking feature information
## INFO [2023-06-20 11:08:39] Checking that feature IDs are unique and not stored as numbers
## INFO [2023-06-20 11:08:39] Checking that feature abundances only contain numeric values
## INFO [2023-06-20 11:08:39] Setting row and column names of exprs based on feature and pheno data
#extract each mode into a single object
mode <- modes$C18_pos# ordered by class
(qualityBPs_b4correction <- plot_sample_boxplots(mode, order_by = "Class", title = "Uncorrected feature abundance"))# ordered by injection order
plot_sample_boxplots(mode, order_by = "Injection_order", title = "Uncorrected feature abundance")drift corrected takes up to 2 minutes
mode <- flag_detection(mode, qc_limit = 0.75, group_limit = 0.8)## INFO [2023-06-20 11:08:42]
## 0% of features flagged for low detection rate
corrected <- correct_drift(mode, log_transform = FALSE)## INFO [2023-06-20 11:08:42]
## Starting drift correction at 2023-06-20 11:08:42.155506
## INFO [2023-06-20 11:08:43] Drift correction performed at 2023-06-20 11:08:43.714497
## INFO [2023-06-20 11:08:44] Inspecting drift correction results 2023-06-20 11:08:44.485455
## INFO [2023-06-20 11:08:45] Drift correction results inspected at 2023-06-20 11:08:45.992644
## INFO [2023-06-20 11:08:45]
## Drift correction results inspected, report:
## Drift_corrected: 100%
The output is the percentage of features that were drift corrected. The remaining "low-quality" percentage represents features for which drift correction (DC) did not improve the RSD and D-ratio of the original data.
inspected <- inspect_dc(orig = mode, dc = corrected, check_quality = TRUE)## INFO [2023-06-20 11:08:46] Inspecting drift correction results 2023-06-20 11:08:46.815107
## INFO [2023-06-20 11:08:49] Drift correction results inspected at 2023-06-20 11:08:49.378437
## INFO [2023-06-20 11:08:49]
## Drift correction results inspected, report:
## Drift_corrected: 61%, Low_quality: 39%
(qualityBPS_driftcorrection <- plot_sample_boxplots(corrected, order_by = "Class", title = "Corrected feature abundance"))plot_sample_boxplots(corrected, order_by = "Injection_order", title = "Corrected feature abundance")(qualityBPs_compared <- ggarrange(qualityBPs_b4correction, qualityBPS_driftcorrection,
ncol = 1, nrow = 2))write_to_excel(corrected, "notame_dfs/metaboset_corrected.xlsx")metabdata_corrected <- read.csv(file = "notame_dfs/metaboset_corrected_editedforR.csv",
check.names = FALSE)metabdata_corrected_MZ_RT <- metabdata_corrected %>%
mutate(mass = round(metabdata_corrected$mass, digits = 4), # Decrease number of decimals for m/z & rt
rt = round(metabdata_corrected$rt, digits = 3),
.before=1,
.keep="unused") %>%
unite(mz_rt, c(mass, rt), remove=TRUE) # Combine m/z & rt with _ in betweenmetabdata_corrected_t <- as.data.frame(t(metabdata_corrected_MZ_RT)) %>%
row_to_names(row_number = "find_header") %>% # make MZ_RT column names
rownames_to_column(var = "subj_period") # change rownames to column 1I want the new drift corrected (DC) df to look just like “imp_metabind_clust_log2” df
# return the clustered log2 data to wide format (one column per mz_rt feature)
imp_metabind_clust_log2 <- imp_metabind_clust_tidy_log2 %>%
  dplyr::select(-rel_abund) %>%
  pivot_wider(names_from = mz_rt,
              values_from = rel_abund_log2)
# combine Subject and Period into a subj_period key to mimic the drift-corrected df
subj_per_imp_metabind_clust_log2 <- imp_metabind_clust_log2 %>%
  unite(subj_period, c(Subject, Period), remove = FALSE)
# join the new drift-corrected observations onto the sample metadata (cols 1-12),
# then drop the old (pre-correction) QC observations
DC_imp_metabind_clust_log2 <- subj_per_imp_metabind_clust_log2[, 1:12] %>%
  full_join(metabdata_corrected_t, by = "subj_period") %>%
  filter(subj_period != "QC_QC")
# replace NAs in columns for QCs
DC_imp_metabind_clust_log2[is.na(DC_imp_metabind_clust_log2)] <- "QC"Data after drift correction
# Drop the join key and coerce the feature columns back to numeric
# (the earlier NA -> "QC" fill on L735 coerced them to character).
# NOTE: mutate_at() is superseded; across() is the current dplyr idiom.
DC_imp_metabind_clust_log2 <- DC_imp_metabind_clust_log2 %>%
  dplyr::select(!"subj_period") %>%
  mutate(across(-(1:11), as.numeric))
# PCA on the drift-corrected wide data
PCA.DC_imp_metabind_clust_log2 <- PCA(DC_imp_metabind_clust_log2, # wide data
                                      quali.sup = 1:11, # cols 1-11 treated as qualitative supplementary (not used to build PCs)
                                      graph = FALSE, # don't graph
                                      scale.unit = FALSE) # don't scale, already transformed data
# PCA summary
kable(summary(PCA.DC_imp_metabind_clust_log2))##
## Call:
## PCA(X = DC_imp_metabind_clust_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 683.902 247.574 104.066 71.119 61.213 50.743 41.782
## % of var. 42.958 15.551 6.537 4.467 3.845 3.187 2.624
## Cumulative % of var. 42.958 58.509 65.045 69.513 73.357 76.545 79.169
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 34.622 28.155 25.591 23.197 21.062 17.007 16.217
## % of var. 2.175 1.768 1.607 1.457 1.323 1.068 1.019
## Cumulative % of var. 81.344 83.112 84.720 86.177 87.500 88.568 89.587
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 12.253 11.014 10.922 9.426 9.061 8.556 7.560
## % of var. 0.770 0.692 0.686 0.592 0.569 0.537 0.475
## Cumulative % of var. 90.356 91.048 91.734 92.326 92.896 93.433 93.908
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 6.762 6.667 6.434 5.911 5.234 5.011 4.769
## % of var. 0.425 0.419 0.404 0.371 0.329 0.315 0.300
## Cumulative % of var. 94.332 94.751 95.155 95.527 95.855 96.170 96.470
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 4.426 4.391 4.032 3.758 3.644 3.348 3.058
## % of var. 0.278 0.276 0.253 0.236 0.229 0.210 0.192
## Cumulative % of var. 96.748 97.024 97.277 97.513 97.742 97.952 98.144
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 3.027 2.796 2.600 2.505 2.456 2.352 2.259
## % of var. 0.190 0.176 0.163 0.157 0.154 0.148 0.142
## Cumulative % of var. 98.334 98.510 98.673 98.831 98.985 99.133 99.274
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47 Dim.48 Dim.49
## Variance 2.131 1.959 1.782 1.670 1.499 1.445 0.318
## % of var. 0.134 0.123 0.112 0.105 0.094 0.091 0.020
## Cumulative % of var. 99.408 99.531 99.643 99.748 99.842 99.933 99.953
## Dim.50 Dim.51 Dim.52 Dim.53 Dim.54 Dim.55
## Variance 0.191 0.178 0.158 0.148 0.073 0.000
## % of var. 0.012 0.011 0.010 0.009 0.005 0.000
## Cumulative % of var. 99.965 99.976 99.986 99.995 100.000 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2
## 1 | 29.831 | -20.616 1.110 0.478 | -5.138
## 2 | 27.573 | -20.316 1.078 0.543 | -5.252
## 3 | 30.832 | -10.133 0.268 0.108 | -1.549
## 4 | 29.309 | -19.939 1.038 0.463 | -2.943
## 5 | 29.451 | -17.509 0.800 0.353 | -3.955
## 6 | 65.518 | 48.818 6.223 0.555 | -34.458
## 7 | 32.386 | -17.632 0.812 0.296 | -2.927
## 8 | 27.196 | -14.505 0.549 0.284 | -4.137
## 9 | 25.206 | -13.566 0.481 0.290 | -3.434
## 10 | 31.113 | -13.405 0.469 0.186 | -1.236
## ctr cos2 Dim.3 ctr cos2
## 1 0.190 0.030 | -2.862 0.141 0.009 |
## 2 0.199 0.036 | -4.679 0.376 0.029 |
## 3 0.017 0.003 | -0.393 0.003 0.000 |
## 4 0.062 0.010 | -2.200 0.083 0.006 |
## 5 0.113 0.018 | -3.079 0.163 0.011 |
## 6 8.564 0.277 | -15.383 4.061 0.055 |
## 7 0.062 0.008 | -2.138 0.078 0.004 |
## 8 0.123 0.023 | -3.382 0.196 0.015 |
## 9 0.085 0.019 | 3.566 0.218 0.020 |
## 10 0.011 0.002 | -7.102 0.866 0.052 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2
## 226.9516_0.553 | -0.060 0.001 0.014 | 0.106 0.005 0.043 |
## 159.1492_0.608 | -0.520 0.040 0.329 | -0.298 0.036 0.108 |
## 175.1442_0.616 | 0.032 0.000 0.001 | 0.341 0.047 0.159 |
## 189.1684_0.616 | -0.053 0.000 0.007 | 0.008 0.000 0.000 |
## 189.16_0.615 | -0.023 0.000 0.002 | -0.006 0.000 0.000 |
## 156.0769_0.621 | -0.011 0.000 0.000 | -0.071 0.002 0.007 |
## 170.0926_0.62 | -0.104 0.002 0.012 | -0.052 0.001 0.003 |
## 136.0482_0.633 | 0.014 0.000 0.000 | -0.037 0.001 0.003 |
## 137.071_0.636 | 0.039 0.000 0.006 | 0.141 0.008 0.071 |
## 162.1126_0.642 | 0.106 0.002 0.010 | -0.216 0.019 0.041 |
## Dim.3 ctr cos2
## 226.9516_0.553 0.032 0.001 0.004 |
## 159.1492_0.608 -0.081 0.006 0.008 |
## 175.1442_0.616 -0.195 0.036 0.052 |
## 189.1684_0.616 0.140 0.019 0.051 |
## 189.16_0.615 -0.058 0.003 0.011 |
## 156.0769_0.621 -0.027 0.001 0.001 |
## 170.0926_0.62 0.288 0.080 0.089 |
## 136.0482_0.633 0.068 0.004 0.011 |
## 137.071_0.636 0.012 0.000 0.001 |
## 162.1126_0.642 0.274 0.072 0.066 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2
## sample_ID_6101_U1_C18POS_59 | 29.831 | -20.616 0.478 -0.788 | -5.138
## sample_ID_6101_U2_C18POS_30 | 33.739 | -15.996 0.225 -0.612 | -3.239
## sample_ID_6101_U3_C18POS_29_1 | 28.213 | -19.582 0.482 -0.749 | -4.777
## sample_ID_6101_U4_C18POS_14 | 29.307 | -19.516 0.443 -0.746 | -4.083
## sample_ID_6102_U1_C18POS_26_1 | 27.573 | -20.316 0.543 -0.777 | -5.252
## sample_ID_6102_U2_C18POS_16 | 26.916 | -15.644 0.338 -0.598 | -2.420
## sample_ID_6102_U3_C18POS_48 | 35.629 | -13.369 0.141 -0.511 | 0.584
## sample_ID_6102_U4_C18POS_50 | 26.846 | -15.989 0.355 -0.611 | -2.253
## sample_ID_6103_U1_C18POS_21 | 30.832 | -10.133 0.108 -0.387 | -1.549
## sample_ID_6103_U2_C18POS_60 | 33.584 | -14.463 0.185 -0.553 | -1.448
## cos2 v.test Dim.3 cos2 v.test
## sample_ID_6101_U1_C18POS_59 0.030 -0.327 | -2.862 0.009 -0.281 |
## sample_ID_6101_U2_C18POS_30 0.009 -0.206 | 8.201 0.059 0.804 |
## sample_ID_6101_U3_C18POS_29_1 0.029 -0.304 | -2.304 0.007 -0.226 |
## sample_ID_6101_U4_C18POS_14 0.019 -0.260 | -2.065 0.005 -0.202 |
## sample_ID_6102_U1_C18POS_26_1 0.036 -0.334 | -4.679 0.029 -0.459 |
## sample_ID_6102_U2_C18POS_16 0.008 -0.154 | 7.097 0.070 0.696 |
## sample_ID_6102_U3_C18POS_48 0.000 0.037 | 1.997 0.003 0.196 |
## sample_ID_6102_U4_C18POS_50 0.007 -0.143 | -1.443 0.003 -0.141 |
## sample_ID_6103_U1_C18POS_21 0.003 -0.098 | -0.393 0.000 -0.038 |
## sample_ID_6103_U2_C18POS_60 0.002 -0.092 | 8.967 0.071 0.879 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| sample_ID_6101_U1_C18POS_59 | | | 29.831 | | | -20.616 | 0.478 | -0.788 | | | -5.138 | 0.030 | -0.327 | | | -2.862 | 0.009 | -0.281 | | |
| sample_ID_6101_U2_C18POS_30 | | | 33.739 | | | -15.996 | 0.225 | -0.612 | | | -3.239 | 0.009 | -0.206 | | | 8.201 | 0.059 | 0.804 | | |
| sample_ID_6101_U3_C18POS_29_1 | | | 28.213 | | | -19.582 | 0.482 | -0.749 | | | -4.777 | 0.029 | -0.304 | | | -2.304 | 0.007 | -0.226 | | |
| sample_ID_6101_U4_C18POS_14 | | | 29.307 | | | -19.516 | 0.443 | -0.746 | | | -4.083 | 0.019 | -0.260 | | | -2.065 | 0.005 | -0.202 | | |
| sample_ID_6102_U1_C18POS_26_1 | | | 27.573 | | | -20.316 | 0.543 | -0.777 | | | -5.252 | 0.036 | -0.334 | | | -4.679 | 0.029 | -0.459 | | |
| sample_ID_6102_U2_C18POS_16 | | | 26.916 | | | -15.644 | 0.338 | -0.598 | | | -2.420 | 0.008 | -0.154 | | | 7.097 | 0.070 | 0.696 | | |
| sample_ID_6102_U3_C18POS_48 | | | 35.629 | | | -13.369 | 0.141 | -0.511 | | | 0.584 | 0.000 | 0.037 | | | 1.997 | 0.003 | 0.196 | | |
| sample_ID_6102_U4_C18POS_50 | | | 26.846 | | | -15.989 | 0.355 | -0.611 | | | -2.253 | 0.007 | -0.143 | | | -1.443 | 0.003 | -0.141 | | |
| sample_ID_6103_U1_C18POS_21 | | | 30.832 | | | -10.133 | 0.108 | -0.387 | | | -1.549 | 0.003 | -0.098 | | | -0.393 | 0.000 | -0.038 | | |
| sample_ID_6103_U2_C18POS_60 | | | 33.584 | | | -14.463 | 0.185 | -0.553 | | | -1.448 | 0.002 | -0.092 | | | 8.967 | 0.071 | 0.879 | | |
# pull the individual (sample) PC coordinates into a data frame
PC_coord_QC_log2 <- as.data.frame(PCA.DC_imp_metabind_clust_log2$ind$coord)
# bind back metadata from cols 1-11
PC_coord_QC_log2 <- bind_cols(DC_imp_metabind_clust_log2[,1:11], PC_coord_QC_log2)
# grab the eigenvalue table (variance explained per dimension)
importance_QC <- PCA.DC_imp_metabind_clust_log2$eig
# percent variance explained by PC1, round to 2 digits
PC1_withQC <- round(importance_QC[1,2], 2)
# percent variance explained by PC2, round to 2 digits
PC2_withQC <- round(importance_QC[2,2], 2)Using FactoExtra package
# scree plot
fviz_eig(PCA.DC_imp_metabind_clust_log2)# get eigenvalues
kable(get_eig(PCA.DC_imp_metabind_clust_log2))| eigenvalue | variance.percent | cumulative.variance.percent | |
|---|---|---|---|
| Dim.1 | 683.9018350 | 42.9578232 | 42.95782 |
| Dim.2 | 247.5739259 | 15.5508238 | 58.50865 |
| Dim.3 | 104.0663840 | 6.5367061 | 65.04535 |
| Dim.4 | 71.1186431 | 4.4671646 | 69.51252 |
| Dim.5 | 61.2125083 | 3.8449321 | 73.35745 |
| Dim.6 | 50.7432449 | 3.1873278 | 76.54478 |
| Dim.7 | 41.7815783 | 2.6244200 | 79.16920 |
| Dim.8 | 34.6220192 | 2.1747077 | 81.34391 |
| Dim.9 | 28.1545052 | 1.7684647 | 83.11237 |
| Dim.10 | 25.5909303 | 1.6074393 | 84.71981 |
| Dim.11 | 23.1967496 | 1.4570539 | 86.17686 |
| Dim.12 | 21.0622589 | 1.3229805 | 87.49984 |
| Dim.13 | 17.0067675 | 1.0682435 | 88.56809 |
| Dim.14 | 16.2173468 | 1.0186578 | 89.58675 |
| Dim.15 | 12.2532775 | 0.7696633 | 90.35641 |
| Dim.16 | 11.0141758 | 0.6918318 | 91.04824 |
| Dim.17 | 10.9220212 | 0.6860433 | 91.73428 |
| Dim.18 | 9.4259369 | 0.5920700 | 92.32635 |
| Dim.19 | 9.0611230 | 0.5691550 | 92.89551 |
| Dim.20 | 8.5555767 | 0.5374002 | 93.43291 |
| Dim.21 | 7.5595219 | 0.4748351 | 93.90774 |
| Dim.22 | 6.7618090 | 0.4247285 | 94.33247 |
| Dim.23 | 6.6673590 | 0.4187958 | 94.75127 |
| Dim.24 | 6.4343824 | 0.4041619 | 95.15543 |
| Dim.25 | 5.9106377 | 0.3712640 | 95.52669 |
| Dim.26 | 5.2343762 | 0.3287861 | 95.85548 |
| Dim.27 | 5.0110426 | 0.3147579 | 96.17024 |
| Dim.28 | 4.7692869 | 0.2995725 | 96.46981 |
| Dim.29 | 4.4255192 | 0.2779795 | 96.74779 |
| Dim.30 | 4.3907598 | 0.2757961 | 97.02359 |
| Dim.31 | 4.0323478 | 0.2532833 | 97.27687 |
| Dim.32 | 3.7576956 | 0.2360316 | 97.51290 |
| Dim.33 | 3.6443434 | 0.2289116 | 97.74181 |
| Dim.34 | 3.3478394 | 0.2102873 | 97.95210 |
| Dim.35 | 3.0581248 | 0.1920895 | 98.14419 |
| Dim.36 | 3.0269329 | 0.1901303 | 98.33432 |
| Dim.37 | 2.7960437 | 0.1756275 | 98.50995 |
| Dim.38 | 2.5999216 | 0.1633085 | 98.67326 |
| Dim.39 | 2.5045627 | 0.1573187 | 98.83057 |
| Dim.40 | 2.4556572 | 0.1542468 | 98.98482 |
| Dim.41 | 2.3524285 | 0.1477627 | 99.13258 |
| Dim.42 | 2.2591637 | 0.1419045 | 99.27449 |
| Dim.43 | 2.1308919 | 0.1338474 | 99.40834 |
| Dim.44 | 1.9585097 | 0.1230196 | 99.53136 |
| Dim.45 | 1.7816994 | 0.1119136 | 99.64327 |
| Dim.46 | 1.6695074 | 0.1048665 | 99.74814 |
| Dim.47 | 1.4992727 | 0.0941736 | 99.84231 |
| Dim.48 | 1.4451116 | 0.0907716 | 99.93308 |
| Dim.49 | 0.3180101 | 0.0199751 | 99.95306 |
| Dim.50 | 0.1910025 | 0.0119974 | 99.96505 |
| Dim.51 | 0.1781614 | 0.0111908 | 99.97624 |
| Dim.52 | 0.1576564 | 0.0099028 | 99.98615 |
| Dim.53 | 0.1477544 | 0.0092809 | 99.99543 |
| Dim.54 | 0.0727906 | 0.0045722 | 100.00000 |
| Dim.55 | 0.0000037 | 0.0000002 | 100.00000 |
# scores plot
fviz_pca_ind(PCA.DC_imp_metabind_clust_log2)# manual scores plot
(PCA_withQCs <- PC_coord_QC_log2 %>%
ggplot(aes(x = Dim.1, y = Dim.2,
fill = factor(Intervention, levels = c("Yellow", "Red", "QC")))) +
geom_point(shape = 21, alpha = 0.8) +
scale_fill_manual(values = c("gold", "tomato1", "light grey")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_withQC/PC1_withQC) +
labs(x = glue::glue("PC1: {PC1_withQC}%"),
y = glue::glue("PC2: {PC2_withQC}%"),
fill = "Group",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data"))DC_imp_metabind_clust_log2_noQCs <- DC_imp_metabind_clust_log2 %>%
filter(Intervention != "QC")
# PCA on the drift-corrected wide data, QC samples removed
PCA.DC_imp_metabind_clust_log2_noQCs <- PCA(DC_imp_metabind_clust_log2_noQCs, # wide data
                                            quali.sup = 1:11, # cols 1-11 as qualitative supplementary
                                            graph = FALSE, # don't graph
                                            scale.unit = FALSE) # data already log2 transformed
# look at summary
kable(summary(PCA.DC_imp_metabind_clust_log2_noQCs))##
## Call:
## PCA(X = DC_imp_metabind_clust_log2_noQCs, scale.unit = FALSE,
## quali.sup = 1:11, graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 441.833 212.422 85.731 80.113 59.413 51.306 45.168
## % of var. 33.213 15.968 6.444 6.022 4.466 3.857 3.395
## Cumulative % of var. 33.213 49.180 55.625 61.647 66.113 69.970 73.365
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 34.113 29.960 27.706 25.293 19.946 18.931 17.610
## % of var. 2.564 2.252 2.083 1.901 1.499 1.423 1.324
## Cumulative % of var. 75.929 78.181 80.264 82.165 83.665 85.088 86.411
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 13.981 12.769 11.055 10.570 10.091 9.072 8.181
## % of var. 1.051 0.960 0.831 0.795 0.759 0.682 0.615
## Cumulative % of var. 87.462 88.422 89.253 90.048 90.806 91.488 92.103
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 7.888 7.504 6.922 6.385 5.991 5.614 5.164
## % of var. 0.593 0.564 0.520 0.480 0.450 0.422 0.388
## Cumulative % of var. 92.696 93.260 93.780 94.260 94.711 95.133 95.521
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 5.135 4.702 4.388 4.250 3.904 3.570 3.553
## % of var. 0.386 0.353 0.330 0.319 0.293 0.268 0.267
## Cumulative % of var. 95.907 96.260 96.590 96.910 97.203 97.471 97.739
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 3.338 3.092 2.969 2.906 2.788 2.652 2.510
## % of var. 0.251 0.232 0.223 0.218 0.210 0.199 0.189
## Cumulative % of var. 97.989 98.222 98.445 98.664 98.873 99.072 99.261
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47
## Variance 2.296 2.110 1.953 1.767 1.704
## % of var. 0.173 0.159 0.147 0.133 0.128
## Cumulative % of var. 99.434 99.592 99.739 99.872 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2
## 1 | 24.673 | -9.190 0.398 0.139 | -7.502 0.552 0.092 |
## 2 | 21.767 | -8.370 0.330 0.148 | -6.285 0.387 0.083 |
## 3 | 29.267 | -2.553 0.031 0.008 | -1.485 0.022 0.003 |
## 4 | 24.488 | -9.865 0.459 0.162 | -5.820 0.332 0.056 |
## 5 | 25.247 | -7.014 0.232 0.077 | -4.946 0.240 0.038 |
## 6 | 70.409 | 66.408 20.794 0.890 | 2.369 0.055 0.001 |
## 7 | 28.585 | -7.483 0.264 0.069 | -4.034 0.160 0.020 |
## 8 | 23.553 | -4.287 0.087 0.033 | -3.661 0.131 0.024 |
## 9 | 21.985 | -4.779 0.108 0.047 | -5.390 0.285 0.060 |
## 10 | 28.428 | -4.512 0.096 0.025 | 0.271 0.001 0.000 |
## Dim.3 ctr cos2
## 1 -9.728 2.300 0.155 |
## 2 -8.551 1.777 0.154 |
## 3 -0.811 0.016 0.001 |
## 4 -7.909 1.520 0.104 |
## 5 -5.180 0.652 0.042 |
## 6 -8.327 1.685 0.014 |
## 7 -9.038 1.985 0.100 |
## 8 -3.914 0.372 0.028 |
## 9 5.090 0.630 0.054 |
## 10 -7.668 1.429 0.073 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 226.9516_0.553 | -0.162 0.006 0.088 | -0.001 0.000 0.000 | -0.044
## 159.1492_0.608 | -0.083 0.002 0.012 | -0.236 0.026 0.094 | -0.091
## 175.1442_0.616 | -0.124 0.004 0.018 | 0.419 0.083 0.207 | -0.022
## 189.1684_0.616 | -0.087 0.002 0.017 | -0.089 0.004 0.018 | 0.060
## 189.16_0.615 | -0.036 0.000 0.004 | -0.024 0.000 0.002 | -0.105
## 156.0769_0.621 | -0.039 0.000 0.002 | -0.145 0.010 0.024 | -0.179
## 170.0926_0.62 | -0.208 0.010 0.040 | -0.333 0.052 0.103 | -0.104
## 136.0482_0.633 | 0.007 0.000 0.000 | -0.078 0.003 0.012 | -0.025
## 137.071_0.636 | -0.056 0.001 0.010 | 0.125 0.007 0.049 | -0.003
## 162.1126_0.642 | 0.050 0.001 0.002 | -0.412 0.080 0.133 | -0.265
## ctr cos2
## 226.9516_0.553 0.002 0.006 |
## 159.1492_0.608 0.010 0.014 |
## 175.1442_0.616 0.001 0.001 |
## 189.1684_0.616 0.004 0.008 |
## 189.16_0.615 0.013 0.031 |
## 156.0769_0.621 0.037 0.036 |
## 170.0926_0.62 0.013 0.010 |
## 136.0482_0.633 0.001 0.001 |
## 137.071_0.636 0.000 0.000 |
## 162.1126_0.642 0.082 0.055 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2 cos2 v.test
## 6101_U1_C18POS_59 | 24.673 | -9.190 0.139 -0.437 | -7.502 0.092 -0.515 |
## 6101_U2_C18POS_30 | 31.022 | -7.697 0.062 -0.366 | -8.210 0.070 -0.563 |
## 6101_U3_C18POS_29_1 | 23.104 | -8.563 0.137 -0.407 | -6.971 0.091 -0.478 |
## 6101_U4_C18POS_14 | 24.577 | -9.044 0.135 -0.430 | -6.798 0.077 -0.466 |
## 6102_U1_C18POS_26_1 | 21.767 | -8.370 0.148 -0.398 | -6.285 0.083 -0.431 |
## 6102_U2_C18POS_16 | 23.410 | -7.332 0.098 -0.349 | -6.343 0.073 -0.435 |
## 6102_U3_C18POS_48 | 33.928 | -7.323 0.047 -0.348 | -3.185 0.009 -0.219 |
## 6102_U4_C18POS_50 | 23.082 | -7.190 0.097 -0.342 | -4.348 0.035 -0.298 |
## 6103_U1_C18POS_21 | 29.267 | -2.553 0.008 -0.121 | -1.485 0.003 -0.102 |
## 6103_U2_C18POS_60 | 31.321 | -7.237 0.053 -0.344 | -6.156 0.039 -0.422 |
## Dim.3 cos2 v.test
## 6101_U1_C18POS_59 -9.728 0.155 -1.051 |
## 6101_U2_C18POS_30 8.542 0.076 0.923 |
## 6101_U3_C18POS_29_1 -7.504 0.105 -0.810 |
## 6101_U4_C18POS_14 -8.527 0.120 -0.921 |
## 6102_U1_C18POS_26_1 -8.551 0.154 -0.923 |
## 6102_U2_C18POS_16 11.340 0.235 1.225 |
## 6102_U3_C18POS_48 -8.579 0.064 -0.927 |
## 6102_U4_C18POS_50 -7.807 0.114 -0.843 |
## 6103_U1_C18POS_21 -0.811 0.001 -0.088 |
## 6103_U2_C18POS_60 14.503 0.214 1.566 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6101_U1_C18POS_59 | | | 24.673 | | | -9.190 | 0.139 | -0.437 | | | -7.502 | 0.092 | -0.515 | | | -9.728 | 0.155 | -1.051 | | |
| 6101_U2_C18POS_30 | | | 31.022 | | | -7.697 | 0.062 | -0.366 | | | -8.210 | 0.070 | -0.563 | | | 8.542 | 0.076 | 0.923 | | |
| 6101_U3_C18POS_29_1 | | | 23.104 | | | -8.563 | 0.137 | -0.407 | | | -6.971 | 0.091 | -0.478 | | | -7.504 | 0.105 | -0.810 | | |
| 6101_U4_C18POS_14 | | | 24.577 | | | -9.044 | 0.135 | -0.430 | | | -6.798 | 0.077 | -0.466 | | | -8.527 | 0.120 | -0.921 | | |
| 6102_U1_C18POS_26_1 | | | 21.767 | | | -8.370 | 0.148 | -0.398 | | | -6.285 | 0.083 | -0.431 | | | -8.551 | 0.154 | -0.923 | | |
| 6102_U2_C18POS_16 | | | 23.410 | | | -7.332 | 0.098 | -0.349 | | | -6.343 | 0.073 | -0.435 | | | 11.340 | 0.235 | 1.225 | | |
| 6102_U3_C18POS_48 | | | 33.928 | | | -7.323 | 0.047 | -0.348 | | | -3.185 | 0.009 | -0.219 | | | -8.579 | 0.064 | -0.927 | | |
| 6102_U4_C18POS_50 | | | 23.082 | | | -7.190 | 0.097 | -0.342 | | | -4.348 | 0.035 | -0.298 | | | -7.807 | 0.114 | -0.843 | | |
| 6103_U1_C18POS_21 | | | 29.267 | | | -2.553 | 0.008 | -0.121 | | | -1.485 | 0.003 | -0.102 | | | -0.811 | 0.001 | -0.088 | | |
| 6103_U2_C18POS_60 | | | 31.321 | | | -7.237 | 0.053 | -0.344 | | | -6.156 | 0.039 | -0.422 | | | 14.503 | 0.214 | 1.566 | | |
# pull the individual (sample) PC coordinates into a data frame
PC_coord_noQCs_log2 <- as.data.frame(PCA.DC_imp_metabind_clust_log2_noQCs$ind$coord)
# bind back metadata from cols 1-11
PC_coord_noQCs_log2 <- bind_cols(DC_imp_metabind_clust_log2_noQCs[,1:11], PC_coord_noQCs_log2)
# grab the eigenvalue table (variance explained per dimension)
importance_noQC <- PCA.DC_imp_metabind_clust_log2_noQCs$eig
# percent variance explained by PC1, round to 2 digits
PC1_noQC <- round(importance_noQC[1,2], 2)
# percent variance explained by PC2, round to 2 digits
PC2_noQC <- round(importance_noQC[2,2], 2)Using FactoExtra
# scree plot
fviz_eig(PCA.DC_imp_metabind_clust_log2_noQCs)# scores plot
fviz_pca_ind(PCA.DC_imp_metabind_clust_log2_noQCs)# plot of contributions from features to PC1
(var_contrib_noQCs_PC1 <- fviz_contrib(PCA.DC_imp_metabind_clust_log2_noQCs,
choice = "var",
axes = 1,
top = 25,
title = "Var contribution to PC1: no QCs"))# plot of contributions from features to PC2
(var_contrib_noQCs_PC2 <- fviz_contrib(PCA.DC_imp_metabind_clust_log2_noQCs,
choice = "var",
axes = 2,
top = 25,
title = "Var contribution to PC2: no QCs"))(PCA_withoutQCs <- PC_coord_noQCs_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = Intervention)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gold", "tomato1")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_noQC/PC1_noQC) +
labs(x = glue::glue("PC1: {PC1_noQC}%"),
y = glue::glue("PC2: {PC2_noQC}%"),
fill = "Intervention",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, without QCs"))ggplotly(PCA_withoutQCs)(PCA_withoutQCs.pre_post <- PC_coord_noQCs_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_noQC/PC1_noQC) +
labs(x = glue::glue("PC1: {PC1_noQC}%"),
y = glue::glue("PC2: {PC2_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed, without QCs"))ggplotly(PCA_withoutQCs.pre_post,
tooltip = "text") Looks like subject 6106 and 6112 are outliers
# remove the two outlier subjects flagged from the scores plots (6106 and 6112)
DC_imp_nooutliers_log2 <- DC_imp_metabind_clust_log2 %>%
  filter(Subject != 6106 & Subject != 6112)
# PCA on the outlier-free wide data
PCA.DC_imp_nooutliers_log2 <- PCA(DC_imp_nooutliers_log2, # wide data
                                  quali.sup = 1:11, # cols 1-11 as qualitative supplementary
                                  graph = FALSE, # don't graph
                                  scale.unit = FALSE) # don't scale, already transformed data
# PCA summary
summary(PCA.DC_imp_nooutliers_log2)##
## Call:
## PCA(X = DC_imp_nooutliers_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 628.012 80.003 76.073 56.124 49.726 40.573 34.933
## % of var. 50.664 6.454 6.137 4.528 4.012 3.273 2.818
## Cumulative % of var. 50.664 57.119 63.256 67.783 71.795 75.068 77.886
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 27.643 26.037 24.397 18.719 17.907 12.987 12.775
## % of var. 2.230 2.101 1.968 1.510 1.445 1.048 1.031
## Cumulative % of var. 80.117 82.217 84.185 85.695 87.140 88.188 89.218
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 10.779 10.439 8.938 8.527 7.977 7.597 7.114
## % of var. 0.870 0.842 0.721 0.688 0.643 0.613 0.574
## Cumulative % of var. 90.088 90.930 91.651 92.339 92.982 93.595 94.169
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 6.444 5.977 5.503 5.196 5.152 4.836 4.582
## % of var. 0.520 0.482 0.444 0.419 0.416 0.390 0.370
## Cumulative % of var. 94.689 95.171 95.615 96.035 96.450 96.840 97.210
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 4.162 3.588 3.311 3.170 2.926 2.889 2.644
## % of var. 0.336 0.289 0.267 0.256 0.236 0.233 0.213
## Cumulative % of var. 97.546 97.835 98.102 98.358 98.594 98.827 99.040
## Dim.36 Dim.37 Dim.38 Dim.39 Dim.40 Dim.41 Dim.42
## Variance 2.452 2.321 2.064 2.012 1.763 0.395 0.230
## % of var. 0.198 0.187 0.167 0.162 0.142 0.032 0.019
## Cumulative % of var. 99.238 99.425 99.592 99.754 99.896 99.928 99.947
## Dim.43 Dim.44 Dim.45 Dim.46 Dim.47
## Variance 0.210 0.187 0.174 0.088 0.000
## % of var. 0.017 0.015 0.014 0.007 0.000
## Cumulative % of var. 99.964 99.979 99.993 100.000 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2
## 1 | 27.391 | -17.588 1.026 0.412 | 4.065
## 2 | 25.234 | -17.617 1.030 0.487 | 2.005
## 3 | 29.723 | -6.298 0.132 0.045 | -9.076
## 4 | 26.846 | -16.311 0.883 0.369 | -0.254
## 5 | 27.493 | -14.359 0.684 0.273 | -0.246
## 6 | 30.688 | -14.378 0.686 0.219 | 11.126
## 7 | 25.533 | -11.501 0.439 0.203 | -2.691
## 8 | 23.059 | -9.879 0.324 0.184 | -4.698
## 9 | 30.043 | -10.063 0.336 0.112 | -0.348
## 10 | 25.813 | -9.788 0.318 0.144 | -1.289
## ctr cos2 Dim.3 ctr cos2
## 1 0.430 0.022 | 6.867 1.291 0.063 |
## 2 0.105 0.006 | 6.771 1.255 0.072 |
## 3 2.145 0.093 | 6.240 1.066 0.044 |
## 4 0.002 0.000 | 7.255 1.441 0.073 |
## 5 0.002 0.000 | 4.425 0.536 0.026 |
## 6 3.224 0.131 | 4.961 0.674 0.026 |
## 7 0.189 0.011 | 5.715 0.894 0.050 |
## 8 0.575 0.042 | -4.735 0.614 0.042 |
## 9 0.003 0.000 | 9.683 2.567 0.104 |
## 10 0.043 0.002 | 4.413 0.533 0.029 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2
## 226.9516_0.553 | 0.018 0.000 0.001 | 0.002 0.000 0.000 |
## 159.1492_0.608 | -0.630 0.063 0.468 | 0.219 0.060 0.057 |
## 175.1442_0.616 | 0.098 0.002 0.016 | 0.271 0.092 0.125 |
## 189.1684_0.616 | -0.010 0.000 0.000 | 0.157 0.031 0.062 |
## 189.16_0.615 | -0.012 0.000 0.001 | 0.015 0.000 0.001 |
## 156.0769_0.621 | 0.016 0.000 0.000 | 0.030 0.001 0.001 |
## 170.0926_0.62 | 0.016 0.000 0.000 | 0.291 0.106 0.112 |
## 136.0482_0.633 | 0.016 0.000 0.001 | -0.017 0.000 0.001 |
## 137.071_0.636 | 0.071 0.001 0.019 | 0.078 0.008 0.023 |
## 162.1126_0.642 | 0.160 0.004 0.029 | 0.196 0.048 0.044 |
## Dim.3 ctr cos2
## 226.9516_0.553 0.023 0.001 0.002 |
## 159.1492_0.608 -0.008 0.000 0.000 |
## 175.1442_0.616 -0.103 0.014 0.018 |
## 189.1684_0.616 -0.194 0.050 0.095 |
## 189.16_0.615 0.009 0.000 0.000 |
## 156.0769_0.621 0.067 0.006 0.006 |
## 170.0926_0.62 -0.194 0.050 0.050 |
## 136.0482_0.633 0.057 0.004 0.007 |
## 137.071_0.636 -0.084 0.009 0.027 |
## 162.1126_0.642 0.085 0.009 0.008 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2
## sample_ID_6101_U1_C18POS_59 | 27.391 | -17.588 0.412 -0.702 | 4.065
## sample_ID_6101_U2_C18POS_30 | 31.486 | -11.643 0.137 -0.465 | -0.871
## sample_ID_6101_U3_C18POS_29_1 | 25.769 | -16.428 0.406 -0.656 | 2.637
## sample_ID_6101_U4_C18POS_14 | 26.832 | -16.086 0.359 -0.642 | 0.501
## sample_ID_6102_U1_C18POS_26_1 | 25.234 | -17.617 0.487 -0.703 | 2.005
## sample_ID_6102_U2_C18POS_16 | 24.321 | -11.377 0.219 -0.454 | -2.726
## sample_ID_6102_U3_C18POS_48 | 33.883 | -8.330 0.060 -0.332 | 2.023
## sample_ID_6102_U4_C18POS_50 | 24.519 | -12.113 0.244 -0.483 | -0.591
## sample_ID_6103_U1_C18POS_21 | 29.723 | -6.298 0.045 -0.251 | -9.076
## sample_ID_6103_U2_C18POS_60 | 31.457 | -9.662 0.094 -0.386 | -12.027
## cos2 v.test Dim.3 cos2 v.test
## sample_ID_6101_U1_C18POS_59 0.022 0.454 | 6.867 0.063 0.787 |
## sample_ID_6101_U2_C18POS_30 0.001 -0.097 | -9.024 0.082 -1.035 |
## sample_ID_6101_U3_C18POS_29_1 0.010 0.295 | 5.131 0.040 0.588 |
## sample_ID_6101_U4_C18POS_14 0.000 0.056 | 7.761 0.084 0.890 |
## sample_ID_6102_U1_C18POS_26_1 0.006 0.224 | 6.771 0.072 0.776 |
## sample_ID_6102_U2_C18POS_16 0.013 -0.305 | -10.800 0.197 -1.238 |
## sample_ID_6102_U3_C18POS_48 0.004 0.226 | 8.833 0.068 1.013 |
## sample_ID_6102_U4_C18POS_50 0.001 -0.066 | 8.896 0.132 1.020 |
## sample_ID_6103_U1_C18POS_21 0.093 -1.015 | 6.240 0.044 0.715 |
## sample_ID_6103_U2_C18POS_60 0.146 -1.345 | -9.620 0.094 -1.103 |
# pull the sample (individual) PC coordinates into a data frame,
# with the metadata columns (1-11) bound back in front
PC_nooutliers_QC_log2 <- PCA.DC_imp_nooutliers_log2$ind$coord %>%
  as.data.frame() %>%
  bind_cols(DC_imp_nooutliers_log2[, 1:11], .)
# eigenvalue table: variance explained per dimension
importance_nooutliers_QC <- PCA.DC_imp_nooutliers_log2$eig
# percent variance explained by PC1, round to 2 digits
PC1_nooutliers_withQC <- round(importance_nooutliers_QC[1, 2], 2)
# percent variance explained by PC2, round to 2 digits
PC2_nooutliers_withQC <- round(importance_nooutliers_QC[2,2], 2)Using FactoExtra package
# scree plot
fviz_eig(PCA.DC_imp_nooutliers_log2)# scores plot
fviz_pca_ind(PCA.DC_imp_nooutliers_log2)##### Red vs yellow
# manual scores plot
(PCA_nooutliers_withQCs <- PC_nooutliers_QC_log2 %>%
ggplot(aes(x = Dim.1, y = Dim.2,
fill = factor(Intervention, levels = c("Yellow", "Red", "QC")))) +
geom_point(shape = 21, alpha = 0.8) +
scale_fill_manual(values = c("gold", "tomato1", "light grey")) +
scale_color_manual(values = "black") +
theme_minimal() +
labs(x = glue::glue("PC1: {PC1_nooutliers_withQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_withQC}%"),
fill = "Group",
title = "Principal Components Analysis Scores Plot"))(PCA_nooutliers_prepost_withQCs <- PC_nooutliers_QC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
labs(x = glue::glue("PC1: {PC1_nooutliers_withQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_withQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot"))ggplotly(PCA_nooutliers_prepost_withQCs,
tooltip = "text") imp_nooutliers_noQCs_log2 <- DC_imp_nooutliers_log2 %>%
filter(Intervention != "QC")
# PCA on the wide data with outliers and QCs both removed
PCA.imp_nooutliers_noQCs_log2 <- PCA(imp_nooutliers_noQCs_log2, # wide data
                                     quali.sup = 1:11, # cols 1-11 as qualitative supplementary
                                     graph = FALSE, # don't graph
                                     scale.unit = FALSE) # data already log2 transformed
# look at summary
kable(summary(PCA.imp_nooutliers_noQCs_log2))##
## Call:
## PCA(X = imp_nooutliers_noQCs_log2, scale.unit = FALSE, quali.sup = 1:11,
## graph = FALSE)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 96.261 93.644 67.815 61.286 51.306 44.034 33.471
## % of var. 12.866 12.516 9.064 8.191 6.857 5.885 4.474
## Cumulative % of var. 12.866 25.382 34.446 42.637 49.495 55.380 59.854
## Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14
## Variance 31.911 29.752 22.786 21.924 17.835 15.445 13.502
## % of var. 4.265 3.977 3.046 2.930 2.384 2.064 1.805
## Cumulative % of var. 64.119 68.096 71.141 74.071 76.455 78.520 80.324
## Dim.15 Dim.16 Dim.17 Dim.18 Dim.19 Dim.20 Dim.21
## Variance 12.614 10.861 10.236 9.571 9.324 8.759 7.869
## % of var. 1.686 1.452 1.368 1.279 1.246 1.171 1.052
## Cumulative % of var. 82.010 83.462 84.830 86.109 87.355 88.526 89.578
## Dim.22 Dim.23 Dim.24 Dim.25 Dim.26 Dim.27 Dim.28
## Variance 7.213 6.699 6.235 6.198 5.803 5.513 5.094
## % of var. 0.964 0.895 0.833 0.828 0.776 0.737 0.681
## Cumulative % of var. 90.542 91.437 92.271 93.099 93.875 94.612 95.293
## Dim.29 Dim.30 Dim.31 Dim.32 Dim.33 Dim.34 Dim.35
## Variance 4.361 3.973 3.855 3.538 3.508 3.182 2.942
## % of var. 0.583 0.531 0.515 0.473 0.469 0.425 0.393
## Cumulative % of var. 95.875 96.406 96.922 97.395 97.863 98.289 98.682
## Dim.36 Dim.37 Dim.38 Dim.39
## Variance 2.810 2.480 2.416 2.156
## % of var. 0.376 0.331 0.323 0.288
## Cumulative % of var. 99.058 99.389 99.712 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2
## 1 | 22.177 | 1.491 0.058 0.005 | -9.589 2.455 0.187 |
## 2 | 19.368 | -0.355 0.003 0.000 | -8.720 2.030 0.203 |
## 3 | 29.393 | -9.461 2.325 0.104 | -2.323 0.144 0.006 |
## 4 | 22.105 | -2.507 0.163 0.013 | -8.338 1.856 0.142 |
## 5 | 23.759 | -1.703 0.075 0.005 | -5.259 0.738 0.049 |
## 6 | 27.369 | 9.306 2.249 0.116 | -8.210 1.799 0.090 |
## 7 | 22.789 | -3.784 0.372 0.028 | -4.671 0.583 0.042 |
## 8 | 20.866 | -3.634 0.343 0.030 | 5.419 0.784 0.067 |
## 9 | 28.364 | -2.372 0.146 0.007 | -9.050 2.187 0.102 |
## 10 | 23.978 | -2.290 0.136 0.009 | -4.105 0.450 0.029 |
## Dim.3 ctr cos2
## 1 -4.758 0.835 0.046 |
## 2 -4.139 0.632 0.046 |
## 3 1.185 0.052 0.002 |
## 4 -2.817 0.293 0.016 |
## 5 -1.906 0.134 0.006 |
## 6 2.124 0.166 0.006 |
## 7 2.880 0.306 0.016 |
## 8 -2.610 0.251 0.016 |
## 9 4.774 0.840 0.028 |
## 10 -4.969 0.910 0.043 |
##
## Variables (the 10 first)
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3
## 226.9516_0.553 | -0.013 0.000 0.001 | -0.048 0.002 0.008 | -0.312
## 159.1492_0.608 | 0.218 0.049 0.091 | -0.081 0.007 0.013 | 0.222
## 175.1442_0.616 | 0.318 0.105 0.146 | 0.050 0.003 0.004 | 0.163
## 189.1684_0.616 | 0.210 0.046 0.092 | 0.157 0.026 0.052 | 0.088
## 189.16_0.615 | 0.001 0.000 0.000 | -0.043 0.002 0.005 | -0.069
## 156.0769_0.621 | -0.004 0.000 0.000 | -0.123 0.016 0.017 | 0.063
## 170.0926_0.62 | 0.335 0.117 0.125 | 0.084 0.008 0.008 | -0.207
## 136.0482_0.633 | -0.035 0.001 0.002 | -0.062 0.004 0.007 | -0.339
## 137.071_0.636 | 0.101 0.011 0.033 | 0.064 0.004 0.013 | -0.102
## 162.1126_0.642 | 0.193 0.039 0.037 | -0.129 0.018 0.016 | 0.124
## ctr cos2
## 226.9516_0.553 0.143 0.341 |
## 159.1492_0.608 0.073 0.094 |
## 175.1442_0.616 0.039 0.038 |
## 189.1684_0.616 0.012 0.016 |
## 189.16_0.615 0.007 0.014 |
## 156.0769_0.621 0.006 0.004 |
## 170.0926_0.62 0.063 0.048 |
## 136.0482_0.633 0.169 0.197 |
## 137.071_0.636 0.015 0.034 |
## 162.1126_0.642 0.023 0.015 |
##
## Supplementary categories (the 10 first)
## Dist Dim.1 cos2 v.test Dim.2 cos2 v.test
## 6101_U1_C18POS_59 | 22.177 | 1.491 0.005 0.152 | -9.589 0.187 -0.991 |
## 6101_U2_C18POS_30 | 29.285 | 0.688 0.001 0.070 | 7.942 0.074 0.821 |
## 6101_U3_C18POS_29_1 | 20.734 | 0.603 0.001 0.061 | -7.336 0.125 -0.758 |
## 6101_U4_C18POS_14 | 22.205 | -1.904 0.007 -0.194 | -9.051 0.166 -0.935 |
## 6102_U1_C18POS_26_1 | 19.368 | -0.355 0.000 -0.036 | -8.720 0.203 -0.901 |
## 6102_U2_C18POS_16 | 21.448 | -0.518 0.001 -0.053 | 10.544 0.242 1.090 |
## 6102_U3_C18POS_48 | 32.995 | 0.188 0.000 0.019 | -8.643 0.069 -0.893 |
## 6102_U4_C18POS_50 | 21.415 | -2.611 0.015 -0.266 | -8.582 0.161 -0.887 |
## 6103_U1_C18POS_21 | 29.393 | -9.461 0.104 -0.964 | -2.323 0.006 -0.240 |
## 6103_U2_C18POS_60 | 29.874 | -9.327 0.097 -0.951 | 12.634 0.179 1.306 |
## Dim.3 cos2 v.test
## 6101_U1_C18POS_59 -4.758 0.046 -0.578 |
## 6101_U2_C18POS_30 4.097 0.020 0.497 |
## 6101_U3_C18POS_29_1 -6.740 0.106 -0.818 |
## 6101_U4_C18POS_14 -1.348 0.004 -0.164 |
## 6102_U1_C18POS_26_1 -4.139 0.046 -0.503 |
## 6102_U2_C18POS_16 7.046 0.108 0.856 |
## 6102_U3_C18POS_48 2.545 0.006 0.309 |
## 6102_U4_C18POS_50 7.651 0.128 0.929 |
## 6103_U1_C18POS_21 1.185 0.002 0.144 |
## 6103_U2_C18POS_60 7.989 0.072 0.970 |
| Dist | Dim.1 | cos2 | v.test | Dim.2 | cos2 | v.test | Dim.3 | cos2 | v.test | ||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6101_U1_C18POS_59 | | | 22.177 | | | 1.491 | 0.005 | 0.152 | | | -9.589 | 0.187 | -0.991 | | | -4.758 | 0.046 | -0.578 | | |
| 6101_U2_C18POS_30 | | | 29.285 | | | 0.688 | 0.001 | 0.070 | | | 7.942 | 0.074 | 0.821 | | | 4.097 | 0.020 | 0.497 | | |
| 6101_U3_C18POS_29_1 | | | 20.734 | | | 0.603 | 0.001 | 0.061 | | | -7.336 | 0.125 | -0.758 | | | -6.740 | 0.106 | -0.818 | | |
| 6101_U4_C18POS_14 | | | 22.205 | | | -1.904 | 0.007 | -0.194 | | | -9.051 | 0.166 | -0.935 | | | -1.348 | 0.004 | -0.164 | | |
| 6102_U1_C18POS_26_1 | | | 19.368 | | | -0.355 | 0.000 | -0.036 | | | -8.720 | 0.203 | -0.901 | | | -4.139 | 0.046 | -0.503 | | |
| 6102_U2_C18POS_16 | | | 21.448 | | | -0.518 | 0.001 | -0.053 | | | 10.544 | 0.242 | 1.090 | | | 7.046 | 0.108 | 0.856 | | |
| 6102_U3_C18POS_48 | | | 32.995 | | | 0.188 | 0.000 | 0.019 | | | -8.643 | 0.069 | -0.893 | | | 2.545 | 0.006 | 0.309 | | |
| 6102_U4_C18POS_50 | | | 21.415 | | | -2.611 | 0.015 | -0.266 | | | -8.582 | 0.161 | -0.887 | | | 7.651 | 0.128 | 0.929 | | |
| 6103_U1_C18POS_21 | | | 29.393 | | | -9.461 | 0.104 | -0.964 | | | -2.323 | 0.006 | -0.240 | | | 1.185 | 0.002 | 0.144 | | |
| 6103_U2_C18POS_60 | | | 29.874 | | | -9.327 | 0.097 | -0.951 | | | 12.634 | 0.179 | 1.306 | | | 7.989 | 0.072 | 0.970 | | |
# ---- Scores + contribution plots (log2, outliers removed, no QCs) ----
# pull PC coordinates into df
PC_coord_nooutliers_noQC_log2 <- as.data.frame(PCA.imp_nooutliers_noQCs_log2$ind$coord)
# bind back metadata from cols 1-11
PC_coord_nooutliers_noQC_log2 <- bind_cols(imp_nooutliers_noQCs_log2[,1:11], PC_coord_nooutliers_noQC_log2)
# grab some variance explained
importance_nooutliers_noQC <- PCA.imp_nooutliers_noQCs_log2$eig
# set variance explained with PC1, round to 2 digits
PC1_nooutliers_noQC <- round(importance_nooutliers_noQC[1,2], 2)
# set variance explained with PC2, round to 2 digits
PC2_nooutliers_noQC <- round(importance_nooutliers_noQC[2,2], 2)Using FactoExtra
# scree plot
fviz_eig(PCA.imp_nooutliers_noQCs_log2)# scores plot
fviz_pca_ind(PCA.imp_nooutliers_noQCs_log2)# plot of contributions from features to PC1
(var_contrib_nooutliers_noQCs_PC1 <- fviz_contrib(PCA.imp_nooutliers_noQCs_log2,
choice = "var",
axes = 1,
top = 20,
title = "Var contribution to PC1: no outliers, no QCs"))# plot of contributions from features to PC2
(var_contrib_nooutliers_noQCs_PC2 <- fviz_contrib(PCA.imp_nooutliers_noQCs_log2,
choice = "var",
axes = 2,
top = 20,
title = "Var contribution to PC2: no outliers, no QCs"))(PCA_nooutliers_withoutQCs <- PC_coord_nooutliers_noQC_log2 %>%
# Scores colored by intervention; alphabetical levels: Red = tomato1, Yellow = gold
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = Intervention)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("tomato1", "gold")) +
scale_color_manual(values = "black") +
theme_minimal() +
# aspect ratio proportional to the PC2/PC1 variance ratio
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "Intervention",
title = "Principal Components Analysis Scores Plot"))ggplotly(PCA_nooutliers_withoutQCs)(PCA_nooutliers_noQCs.prepost <- PC_coord_nooutliers_noQC_log2 %>%
# Same scores colored by pre/post x intervention; `text` drives plotly hover
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(pre_post_intervention, levels = c("pre_Yellow",
"post_Yellow",
"pre_Red",
"post_Red")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("gray", "yellow1", "pink1", "red2")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "pre_post",
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, no outliers"))ggplotly(PCA_nooutliers_noQCs.prepost,
# Scores plot colored by sex (M = green, F = pink), interactive via plotly.
# NOTE(review): subtitle says "no 6106" but the upstream no-outlier data may
# also exclude 6112 -- confirm which subjects were dropped.
tooltip = "text") (PCA_nooutliers_noQCs.MvsF <- PC_coord_nooutliers_noQC_log2 %>%
ggplot(aes(x = Dim.1,
y = Dim.2,
fill = factor(Sex, levels = c("M","F")),
text = sample_ID)) +
geom_point(shape = 21, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha=0.5) +
geom_vline(xintercept = 0, linetype = "dashed", alpha=0.5) +
scale_fill_manual(values = c("green", "pink")) +
scale_color_manual(values = "black") +
theme_minimal() +
coord_fixed(PC2_nooutliers_noQC/PC1_nooutliers_noQC) +
labs(x = glue::glue("PC1: {PC1_nooutliers_noQC}%"),
y = glue::glue("PC2: {PC2_nooutliers_noQC}%"),
fill = "Sex", # fixed: legend title was copy-pasted as "pre_post" but this plot is colored by Sex
title = "Principal Components Analysis Scores Plot",
subtitle = "Log2 transformed data, no 6106"))ggplotly(PCA_nooutliers_noQCs.MvsF,
tooltip = "text") if (!requireNamespace('BiocManager', quietly = TRUE))
install.packages('BiocManager')
BiocManager::install('PCAtools')library(PCAtools)# create rel abund df suitable for PCAtools package
# PCAtools expects features as rows and samples as columns, so transpose
imp_clust_omicsdata_outliers_forPCAtools <- as.data.frame(t(imp_clust)) # transpose df
names(imp_clust_omicsdata_outliers_forPCAtools) <- imp_clust_omicsdata_outliers_forPCAtools[1,] # make sample IDs column names
imp_clust_omicsdata_outliers_forPCAtools <- imp_clust_omicsdata_outliers_forPCAtools[-1,] # remove sample ID row
# create metadata df suitable for PCAtools pckg
metadata_outliers_forPCAtools <- metadata %>%
column_to_rownames(var = "sample_ID") # change sample ID column to rownames
# create a vector so that col names in abundance df matches metadata df
# PCAtools requires rownames(metadata) to match colnames(abundance) exactly
order_outliers_forPCAtools <- match(rownames(metadata_outliers_forPCAtools), colnames(imp_clust_omicsdata_outliers_forPCAtools))
# reorder col names in abundance df so that it matches metadata
abundata_outliers_reordered_forPCAtools <- imp_clust_omicsdata_outliers_forPCAtools[ ,order_outliers_forPCAtools]
# change abundance df to numeric
# (transposing made everything character, so convert back)
abundata_outliers_reordered_forPCAtools <- abundata_outliers_reordered_forPCAtools %>%
mutate_all(as.numeric)
# Log transform
log2_abundata_outliers_forPCAtools <- log2(abundata_outliers_reordered_forPCAtools)
# unite pre_post column with intervention column to create pre_intervention column
metadata_outliers_forPCAtools <- metadata_outliers_forPCAtools %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)# pca
# PCAtools PCA on log2 data that still contains the outlier subjects
p_outliers <- PCAtools::pca(log2_abundata_outliers_forPCAtools,
metadata = metadata_outliers_forPCAtools,
scale = FALSE # using scaled data already (log2 transformed)
)
# plot
PCAtools::biplot(p_outliers,
showLoadings = TRUE, # show variables that contribute the most to PCs
lab = NULL,
# NOTE(review): `title = )` passes an empty (missing) argument -- the
# function default is used; the dangling `title =` should be removed
title = ) biplot(p_outliers,
lab = paste0(metadata_outliers_forPCAtools$Subject),
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
# ellipse config
ellipse = TRUE,
ellipseType = 't',
ellipseLevel = 0.95,
ellipseFill = TRUE,
ellipseAlpha = 0.2,
ellipseLineSize = 1.0,
xlim = c(-100,150), ylim = c(-80, 80),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot with 95% Confidence Interval",
subtitle = "Log2 transformed data, C18 (+), with outliers, no QCs")(PCA.colby.prevspost_outliers <- biplot(p_outliers,
lab = NULL,
# or lab = paste0(metadata_forPCAtools$Subject),
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot with Loadings",
subtitle = "Log2 transformed data, C18 (+), without QCs but with outliers",
showLoadings = TRUE))# create rel abund df suitable for PCAtools package
# ---- PCAtools input prep: no QCs, outlier subjects 6106/6112 removed ----
imp_clust_omicsdata_forPCAtools <- as.data.frame(t(imp_clust)) # transpose df
names(imp_clust_omicsdata_forPCAtools) <- imp_clust_omicsdata_forPCAtools[1,] # make sample IDs column names
imp_clust_omicsdata_forPCAtools <- imp_clust_omicsdata_forPCAtools[-1,] # remove sample ID row
imp_clust_omicsdata_forPCAtools <- imp_clust_omicsdata_forPCAtools %>%
dplyr::select(!contains("QC")) # remove QC observations
# create metadata df suitable for PCAtools pckg
metadata_forPCAtools <- metadata %>%
column_to_rownames(var = "sample_ID") # change sample ID column to rownames
# create a vector so that col names in abundance df matches metadata df
order_forPCAtools <- match(rownames(metadata_forPCAtools), colnames(imp_clust_omicsdata_forPCAtools))
# reorder col names in abundance df so that it matches metadata
abundata_reordered_forPCAtools <- imp_clust_omicsdata_forPCAtools[ ,order_forPCAtools]
# change abundance df to numeric
abundata_reordered_forPCAtools <- abundata_reordered_forPCAtools %>%
mutate_all(as.numeric)
# Log transform
log2_abundata_forPCAtools <- log2(abundata_reordered_forPCAtools)
# remove outlier subj from both df
# (columns are dropped from abundance and rows from metadata so they stay aligned)
log2_abundata_forPCAtools <- log2_abundata_forPCAtools %>%
dplyr::select(!contains("6106")) %>%
dplyr::select(!contains("6112"))
metadata_forPCAtools <- metadata_forPCAtools %>%
filter(Subject != 6106,
Subject != 6112)
# unite pre_post column with intervention column to create pre_intervention column
metadata_forPCAtools <- metadata_forPCAtools %>%
unite(col = "pre_post_intervention",
c("pre_post","Intervention"),
sep = "_",
remove = FALSE)# pca
p <- PCAtools::pca(log2_abundata_forPCAtools,
metadata = metadata_forPCAtools,
scale = FALSE # using scaled data already (log2 transformed)
)
# plot
PCAtools::biplot(p,
showLoadings = TRUE, # show variables that contribute the most to PCs
lab = NULL,
# NOTE(review): dangling `title = ` passes a missing argument; remove it
title = )Horn’s parallel analysis
# Horn's parallel analysis: estimate how many PCs are non-noise
horn <- parallelPCA(log2_abundata_forPCAtools)
horn$n## [1] 9
Elbow method
# Elbow heuristic on the scree curve as a second estimate
elbow <- findElbowPoint(p$variance)
elbow## PC5
## 5
# Scree plot annotated with both PC-count estimates
screeplot(p,
components = getComponents(p, 1:20),
vline = c(horn$n, elbow)) +
geom_label(aes(x = horn$n + 1, y = 50,
label = 'Horn\'s', vjust = -1, size = 8)) +
geom_label(aes(x = elbow + 1, y = 50,
label = 'Elbow method', vjust = -3, size = 8))How many PCs do we need to capture at least 80% variance?
# First PC whose cumulative variance exceeds 80%
which(cumsum(p$variance) > 80)[1]## PC14
## 14
# Labeled scores plot with 95% t-distribution ellipses per group
biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
hline = 0, vline = 0,
# ellipse config
ellipse = TRUE,
ellipseType = 't', # assumes multivariate t-distribution
ellipseLevel = 0.95,
ellipseFill = TRUE,
ellipseAlpha = 0.2,
ellipseLineSize = 0,
xlim = c(-50,50), ylim = c(-30,25),
legendPosition = 'right',
title = "PCA Scores Plot",
subtitle = "Log2 transformed data, C18 (+), outliers removed, no QCs \n95% confidence level ellipses")(PCA.colby.prevspost <- biplot(p,
lab = NULL,
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
"pre_Red" = "red",
"post_Red" = "red4"),
hline = 0, vline = 0,
legendPosition = 'right',
title = "PCA Scores Plot",
subtitle = "Log2 transformed data, C18 (+), outliers removed, no QCs \n95% confidence level ellipses",
showLoadings = TRUE))(PCA_pairsplot.colby.prevspost <-
# Pairwise scores plots of PC1-PC10
pairsplot(p,
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'pre_post_intervention',
colkey = c("pre_Yellow" = "yellow",
"post_Yellow" = "yellow4",
# NOTE(review): "pink" here differs from the "red" used in the
# biplots above -- confirm this color change is intentional
"pre_Red" = "pink",
"post_Red" = "red4"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm')))(PCA.colby.Sex <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'Sex',
colkey = c("M" = "red",
"F" = "purple"),
hline = 0, vline = 0,
# fixed: close biplot() before '+' -- previously the geom_point() layer was
# added to the string 'right' inside the call, which errors at run time
legendPosition = 'right') +
# invisible aes(text) carrier so ggplotly() below can show Subject on hover
geom_point(aes(text = metadata_forPCAtools$Subject)))ggplotly(PCA.colby.Sex,
tooltip = "text") pairsplot(p,
# Pairwise PC1-PC10 scores, colored by sex
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'Sex',
colkey = c("M" = "red",
"F" = "purple"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))(PCA.colby.overall.prevspost <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'pre_post',
colkey = c("pre" = "orange",
"post" = "green3"),
hline = 0, vline = 0,
# fixed: close biplot() before '+' -- previously the geom_point() layer was
# added to the string 'right' inside the call, which errors at run time
legendPosition = 'right') +
# invisible aes(text) carrier so ggplotly() below can show Subject on hover
geom_point(aes(text = metadata_forPCAtools$Subject)))ggplotly(PCA.colby.overall.prevspost,
tooltip = "text") pairsplot(p,
# Pairwise PC1-PC10 scores, colored by overall pre vs post
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'pre_post',
colkey = c("pre" = "orange",
"post" = "green3"),
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))(PCA.colby.period <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'Period',
hline = 0, vline = 0,
# fixed: close biplot() before '+' -- previously the geom_point() layer was
# added to the string 'right' inside the call, which errors at run time
legendPosition = 'right') +
# invisible aes(text) carrier so ggplotly() below can show Subject on hover
geom_point(aes(text = metadata_forPCAtools$Subject)))ggplotly(PCA.colby.period,
tooltip = "text") pairsplot(p,
# Pairwise PC1-PC10 scores, colored by study period (default palette)
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'Period',
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))(PCA.colby.sequence <- biplot(p,
lab = paste0(metadata_forPCAtools$Subject),
colby = 'sequence',
hline = 0, vline = 0,
# fixed: close biplot() before '+' -- previously the geom_point() layer was
# added to the string 'right' inside the call, which errors at run time
legendPosition = 'right') +
# invisible aes(text) carrier so ggplotly() below can show Subject on hover
geom_point(aes(text = metadata_forPCAtools$Subject)))ggplotly(PCA.colby.sequence,
tooltip = "text") pairsplot(p,
# Pairwise PC1-PC10 scores, colored by intervention sequence (default palette)
components = getComponents(p, c(1:10)),
triangle = TRUE, trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 0.4,
gridlines.major = FALSE, gridlines.minor = FALSE,
colby = 'sequence',
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))This is a cool way to explore the correlations between the metadata and the PCs! I want to look at how the metavariables correlate with PCs that account for 80% variation in the dataset.
Again: How many PCs do we need to capture at least 80% variance?
which(cumsum(p$variance) > 80)[1]## PC14
## 14
# Correlate metadata variables against PC1-14 (the 80%-variance set)
eigencorplot(p,
components = getComponents(p, 1:14), # get components that account for 80% variance
metavars = colnames(metadata_forPCAtools),
col = c('darkblue', 'blue2', 'gray', 'red2', 'darkred'),
cexCorval = 0.7,
colCorval = 'white',
fontCorval = 2,
posLab = 'bottomleft',
rotLabX = 45,
posColKey = 'top',
cexLabColKey = 1.5,
scale = TRUE,
main = 'PC1-14 metadata correlations',
colFrame = 'white',
plotRsquared = FALSE) eigencorplot(p,
# Same correlations as Pearson r^2 with BH-corrected significance stars
components = getComponents(p, 1:14),
metavars = colnames(metadata_forPCAtools),
col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
cexCorval = 1.2,
fontCorval = 2,
posLab = 'all',
rotLabX = 45,
scale = TRUE,
main = bquote(Principal ~ component ~ Pearson ~ r^2 ~ metadata ~ correlates),
plotRsquared = TRUE,
corFUN = 'pearson',
corUSE = 'pairwise.complete.obs',
corMultipleTestCorrection = 'BH',
signifSymbols = c('****', '***', '**', '*', ''),
signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))I am most interested in PCs affected by pre_post_intervention, so I think it would be good to investigate the metabolites that contribute the most to these PCs.
library(mixOmics)Data_forMPCA <- DC_imp_metabind_clust_log2_noQCs %>%
mutate_at("Subject", as.factor)
# Confirm balanced design: each subject contributes 4 samples
summary(as.factor(Data_forMPCA$Subject))## 6101 6102 6103 6104 6105 6106 6108 6109 6110 6111 6112 6113
## 4 4 4 4 4 4 4 4 4 4 4 4
# make a vector for meta variables
(metavar <- Data_forMPCA[,c(1:11)] %>%
colnames())## [1] "sample_ID" "Subject" "Period"
## [4] "pre_post_intervention" "Intervention" "pre_post"
## [7] "sequence" "Intervention_week" "Sex"
## [10] "Age" "BMI"
# Standard (non-multilevel) PCA on feature columns only, for comparison with
# the multilevel version below
mixOmicsPCA.result <- mixOmics::pca(Data_forMPCA[,!names(Data_forMPCA) %in% metavar],
scale = FALSE,
center = FALSE)
plotIndiv(mixOmicsPCA.result,
ind.names = Data_forMPCA$Subject,
group = Data_forMPCA$pre_post_intervention,
legend = TRUE,
legend.title = "Treatment",
title = 'Regular PCA, C18 (+), Log2 transformed')With all data
# Multilevel PCA: `multilevel = Subject` removes between-subject variation so
# within-subject (treatment) effects are easier to see in this crossover design
multilevelPCA.result <- mixOmics::pca(Data_forMPCA[,-(c(1:11))],
multilevel = Data_forMPCA$Subject,
scale = FALSE,
center = FALSE)
plotIndiv(multilevelPCA.result,
ind.names = Data_forMPCA$Subject,
group = Data_forMPCA$pre_post_intervention,
legend = TRUE,
legend.title = "Treatment",
title = 'Multilevel PCA, C18 (+), Log2 transformed')plotLoadings(multilevelPCA.result, ndisplay = 75)# tidy df
# Reshape wide (one column per feature) to long (one row per sample x feature)
DC_imp_metabind_clust_tidy_log2 <- DC_imp_metabind_clust_log2 %>%
pivot_longer(cols = 12:ncol(.),
names_to = "mz_rt",
values_to = "rel_abund_log2")
# use tidy data
head(DC_imp_metabind_clust_tidy_log2)## # A tibble: 6 × 13
## sample_ID Subject Period pre_post_intervention Intervention pre_post sequence
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## 2 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## 3 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## 4 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## 5 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## 6 6101_U1_C… 6101 U1 pre_Red Red pre R_Y
## # ℹ 6 more variables: Intervention_week <chr>, Sex <chr>, Age <chr>, BMI <chr>,
## # mz_rt <chr>, rel_abund_log2 <dbl>
# remove QCs
df_for_stats <- DC_imp_metabind_clust_tidy_log2 %>%
filter(Intervention != "QC")
# check if QCs were removed
unique(df_for_stats$Intervention)## [1] "Red" "Yellow"
# df without outliers
df_for_stats_noOutlier <- df_for_stats %>%
filter(Subject != "6106",
Subject != "6112")
# check if outlier was removed
unique(df_for_stats_noOutlier$Subject)## [1] "6101" "6102" "6103" "6104" "6105" "6108" "6109" "6110" "6111" "6113"
# turn off sci notation outputs
options(scipen = 999)Here, I am comparing pre- to post-intervention for both yellow and tomato soy (Red) juice interventions. I also compare post-yellow to post-red intervention. I am using the log transformed values of rel abundance since parametric tests assume normality.
# Paired t-test (pre vs post) per metabolite feature, Yellow (control) arm.
# run paired t-tests for control intervention
ctrl_t.test_paired <- df_for_stats %>%
filter(Intervention == "Yellow") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
# NOTE(review): p.adjust.method applies within each mz_rt group, and each
# group has only one comparison here -- so no correction happens ACROSS
# the ~900 features; confirm this is the intended multiplicity handling
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant? (filters on raw p, not an adjusted p)
ctrl_t.test_paired_sig <- ctrl_t.test_paired %>%
filter(p <= 0.05)
tibble(ctrl_t.test_paired_sig)## # A tibble: 148 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 100.0756_0.… rel_… post pre 12 12 -2.33 11 4.02e-2 *
## 2 111.044_2.9… rel_… post pre 12 12 -2.40 11 3.55e-2 *
## 3 114.0669_0.… rel_… post pre 12 12 -2.43 11 3.35e-2 *
## 4 132.0768_0.… rel_… post pre 12 12 -3.63 11 3.98e-3 **
## 5 136.0482_0.… rel_… post pre 12 12 3.28 11 7.3 e-3 **
## 6 141.0659_0.… rel_… post pre 12 12 -2.50 11 2.92e-2 *
## 7 142.0862_0.… rel_… post pre 12 12 -3.26 11 7.57e-3 **
## 8 149.0598_7.… rel_… post pre 12 12 2.56 11 2.65e-2 *
## 9 156.0769_0.… rel_… post pre 12 12 -4.69 11 6.63e-4 ***
## 10 159.1492_0.… rel_… post pre 12 12 -2.65 11 2.24e-2 *
## # ℹ 138 more rows
# how many are significant?
nrow(ctrl_t.test_paired_sig)## [1] 148
# Same Yellow-arm paired pre/post test, with outlier subjects 6106/6112 removed
# run paired t-tests for control intervention (no-outlier dataset)
ctrl_noOutlier_t.test_paired <- df_for_stats_noOutlier %>%
filter(Intervention == "Yellow") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant? (raw p <= 0.05, not multiplicity-adjusted)
ctrl_noOutlier_t.test_paired_sig <- ctrl_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(ctrl_noOutlier_t.test_paired_sig)## # A tibble: 98 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 132.0768_0.… rel_… post pre 10 10 -3.15 9 0.0117 *
## 2 136.0482_0.… rel_… post pre 10 10 2.58 9 0.0296 *
## 3 142.0862_0.… rel_… post pre 10 10 -2.82 9 0.02 *
## 4 144.1283_0.… rel_… post pre 10 10 2.33 9 0.0448 *
## 5 156.0769_0.… rel_… post pre 10 10 -4.13 9 0.00255 **
## 6 162.1126_0.… rel_… post pre 10 10 -3.12 9 0.0124 *
## 7 163.1243_0.… rel_… post pre 10 10 -3.98 9 0.00322 **
## 8 166.0862_0.… rel_… post pre 10 10 -3.49 9 0.0068 **
## 9 170.0926_0.… rel_… post pre 10 10 -2.98 9 0.0155 *
## 10 174.1237_2.… rel_… post pre 10 10 4.06 9 0.00283 **
## # ℹ 88 more rows
# how many are significant?
nrow(ctrl_noOutlier_t.test_paired_sig)## [1] 98
# run paired t-tests for the Red (tomato soy juice) intervention
# (original comment said "control" -- it was copy-pasted from the Yellow block)
red_t.test_paired <- df_for_stats %>%
filter(Intervention == "Red") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant? (raw p <= 0.05, not multiplicity-adjusted)
red_t.test_paired_sig <- red_t.test_paired %>%
filter(p <= 0.05)
tibble(red_t.test_paired_sig)## # A tibble: 79 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 111.044_2.9… rel_… post pre 12 12 -2.27 11 0.0439 *
## 2 121.0284_3.… rel_… post pre 12 12 3.24 11 0.00789 **
## 3 135.0441_3.… rel_… post pre 12 12 -3.16 11 0.00908 **
## 4 144.1023_0.… rel_… post pre 12 12 -3.21 11 0.00824 **
## 5 145.0648_5.… rel_… post pre 12 12 -3.31 11 0.0069 **
## 6 160.0967_0.… rel_… post pre 12 12 -3.80 11 0.00295 **
## 7 166.0863_2.… rel_… post pre 12 12 -2.44 11 0.0328 *
## 8 170.0447_0.… rel_… post pre 12 12 2.70 11 0.0207 *
## 9 170.0448_2.… rel_… post pre 12 12 2.74 11 0.0194 *
## 10 190.0499_3.… rel_… post pre 12 12 -2.73 11 0.0196 *
## # ℹ 69 more rows
# how many are significant?
nrow(red_t.test_paired_sig)## [1] 79
# run paired t-tests for the Red intervention, outlier subjects removed
# (original comment said "control" -- copy-paste from the Yellow block)
red_noOutlier_t.test_paired <- df_for_stats_noOutlier %>%
filter(Intervention == "Red") %>%
dplyr::select(Subject, pre_post, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ pre_post,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant? (raw p <= 0.05, not multiplicity-adjusted)
red_noOutlier_t.test_paired_sig <- red_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(red_noOutlier_t.test_paired_sig)## # A tibble: 69 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 121.0284_3.… rel_… post pre 10 10 3.51 9 0.00665 **
## 2 121.0646_4.… rel_… post pre 10 10 3.45 9 0.0073 **
## 3 135.0441_3.… rel_… post pre 10 10 -2.76 9 0.022 *
## 4 138.0551_0.… rel_… post pre 10 10 2.76 9 0.0223 *
## 5 144.1023_0.… rel_… post pre 10 10 -2.95 9 0.0162 *
## 6 145.0648_5.… rel_… post pre 10 10 -2.66 9 0.0259 *
## 7 160.0967_0.… rel_… post pre 10 10 -3.76 9 0.0045 **
## 8 166.0863_2.… rel_… post pre 10 10 -2.29 9 0.0481 *
## 9 190.0499_3.… rel_… post pre 10 10 -2.37 9 0.0417 *
## 10 196.0604_3.… rel_… post pre 10 10 3.25 9 0.00993 **
## # ℹ 59 more rows
# how many are significant?
nrow(red_noOutlier_t.test_paired_sig)## [1] 69
# run paired t-tests for post interventions
# Compares post-Red vs post-Yellow per feature, paired within subject
post_t.test_paired <- df_for_stats %>%
filter(pre_post == "post") %>%
dplyr::select(Subject, Intervention, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ Intervention,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant? (raw p <= 0.05, not multiplicity-adjusted)
post_t.test_paired_sig <- post_t.test_paired %>%
filter(p <= 0.05)
tibble(post_t.test_paired_sig)## # A tibble: 28 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 203.139_0.6… rel_… Red Yellow 12 12 2.33 11 3.99e-2 *
## 2 234.1122_4.… rel_… Red Yellow 12 12 -6.51 11 4.38e-5 ****
## 3 250.1107_3.… rel_… Red Yellow 12 12 -4.16 11 1.59e-3 **
## 4 255.0655_5.… rel_… Red Yellow 12 12 10.2 11 5.95e-7 ****
## 5 271.0596_4.… rel_… Red Yellow 12 12 13.3 11 4.1 e-8 ****
## 6 271.1656_4.… rel_… Red Yellow 12 12 2.33 11 3.98e-2 *
## 7 274.1833_5.… rel_… Red Yellow 12 12 2.36 11 3.8 e-2 *
## 8 277.1432_7.… rel_… Red Yellow 12 12 3.90 11 2.47e-3 **
## 9 292.1575_3.… rel_… Red Yellow 12 12 -2.25 11 4.57e-2 *
## 10 302.1959_0.… rel_… Red Yellow 12 12 2.71 11 2.02e-2 *
## # ℹ 18 more rows
# how many are significant?
nrow(post_t.test_paired_sig)## [1] 28
# run paired t-tests for post interventions
post_noOutlier_t.test_paired <- df_for_stats %>%
filter(pre_post == "post",
# NOTE(review): only 6106 is excluded here, unlike
# df_for_stats_noOutlier which also drops 6112 -- confirm intentional
Subject != "6106") %>%
dplyr::select(Subject, Intervention, mz_rt, rel_abund_log2) %>%
group_by(mz_rt) %>%
t_test(rel_abund_log2 ~ Intervention,
paired = TRUE,
p.adjust.method = "BH") %>% # Benjamini-Hochberg controlling to lower false positives
add_significance()Statistically significant features
# which features are significant? (raw p <= 0.05, not multiplicity-adjusted)
post_noOutlier_t.test_paired_sig <- post_noOutlier_t.test_paired %>%
filter(p <= 0.05)
tibble(post_noOutlier_t.test_paired_sig)## # A tibble: 32 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 142.0862_0.… rel_… Red Yellow 11 11 2.56 10 2.83e-2 *
## 2 159.1492_0.… rel_… Red Yellow 11 11 2.98 10 1.39e-2 *
## 3 160.0967_0.… rel_… Red Yellow 11 11 -2.24 10 4.86e-2 *
## 4 219.1705_0.… rel_… Red Yellow 11 11 -2.36 10 3.97e-2 *
## 5 230.9902_0.… rel_… Red Yellow 11 11 -2.56 10 2.86e-2 *
## 6 234.1122_4.… rel_… Red Yellow 11 11 -4.55 10 1.05e-3 **
## 7 250.1107_3.… rel_… Red Yellow 11 11 -2.82 10 1.8 e-2 *
## 8 255.0655_5.… rel_… Red Yellow 11 11 9.28 10 3.15e-6 ****
## 9 265.1278_3.… rel_… Red Yellow 11 11 2.28 10 4.56e-2 *
## 10 271.0596_4.… rel_… Red Yellow 11 11 11.6 10 4.08e-7 ****
## # ℹ 22 more rows
# how many are significant?
nrow(post_noOutlier_t.test_paired_sig)## [1] 32
Are there any significant features shared between tests with and without outlier?
# Intersection (by feature ID) of the two significant-feature sets;
# inner_join keeps only mz_rt present in BOTH results
post_sig_outlier_comp <- list(post_noOutlier_t.test_paired_sig,
post_t.test_paired_sig) %>%
reduce(inner_join, by = "mz_rt")
tibble(post_sig_outlier_comp)## # A tibble: 22 × 19
## mz_rt .y..x group1.x group2.x n1.x n2.x statistic.x df.x p.x
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 234.1122_4.883 rel_a… Red Yellow 11 11 -4.55 10 1.05e-3
## 2 250.1107_3.546 rel_a… Red Yellow 11 11 -2.82 10 1.8 e-2
## 3 255.0655_5.039 rel_a… Red Yellow 11 11 9.28 10 3.15e-6
## 4 271.0596_4.449 rel_a… Red Yellow 11 11 11.6 10 4.08e-7
## 5 271.1656_4.57 rel_a… Red Yellow 11 11 2.24 10 4.88e-2
## 6 277.1432_7.079 rel_a… Red Yellow 11 11 3.25 10 8.73e-3
## 7 302.1959_0.798 rel_a… Red Yellow 11 11 2.92 10 1.53e-2
## 8 316.1385_4.504 rel_a… Red Yellow 11 11 11.4 10 4.61e-7
## 9 385.1464_0.793 rel_a… Red Yellow 11 11 -2.36 10 4.02e-2
## 10 394.1701_3.105 rel_a… Red Yellow 11 11 -2.24 10 4.87e-2
## # ℹ 12 more rows
## # ℹ 10 more variables: p.signif.x <chr>, .y..y <chr>, group1.y <chr>,
## # group2.y <chr>, n1.y <int>, n2.y <int>, statistic.y <dbl>, df.y <dbl>,
## # p.y <dbl>, p.signif.y <chr>
# how many sig features are shared between df vs df w/o outliers
nrow(post_sig_outlier_comp)## [1] 22
# return sig features present only in df with outlier, and not in df without outlier
tibble(anti_join(post_noOutlier_t.test_paired_sig,
post_t.test_paired_sig,
by = "mz_rt"))## # A tibble: 10 × 10
## mz_rt .y. group1 group2 n1 n2 statistic df p p.signif
## <chr> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 142.0862_0.… rel_… Red Yellow 11 11 2.56 10 0.0283 *
## 2 159.1492_0.… rel_… Red Yellow 11 11 2.98 10 0.0139 *
## 3 160.0967_0.… rel_… Red Yellow 11 11 -2.24 10 0.0486 *
## 4 219.1705_0.… rel_… Red Yellow 11 11 -2.36 10 0.0397 *
## 5 230.9902_0.… rel_… Red Yellow 11 11 -2.56 10 0.0286 *
## 6 265.1278_3.… rel_… Red Yellow 11 11 2.28 10 0.0456 *
## 7 303.0862_4.… rel_… Red Yellow 11 11 -2.36 10 0.0399 *
## 8 330.2275_4.… rel_… Red Yellow 11 11 3.42 10 0.0065 **
## 9 357.1179_3.… rel_… Red Yellow 11 11 -3.28 10 0.00835 **
## 10 369.1289_3.… rel_… Red Yellow 11 11 2.26 10 0.0477 *
# return sig features from df without outlier that are also present in df with outlier
kable(semi_join(post_noOutlier_t.test_paired_sig,
post_t.test_paired_sig,
by = "mz_rt"))| mz_rt | .y. | group1 | group2 | n1 | n2 | statistic | df | p | p.signif |
|---|---|---|---|---|---|---|---|---|---|
| 234.1122_4.883 | rel_abund_log2 | Red | Yellow | 11 | 11 | -4.552660 | 10 | 0.0010500 | ** |
| 250.1107_3.546 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.823864 | 10 | 0.0180000 | * |
| 255.0655_5.039 | rel_abund_log2 | Red | Yellow | 11 | 11 | 9.275176 | 10 | 0.0000032 | **** |
| 271.0596_4.449 | rel_abund_log2 | Red | Yellow | 11 | 11 | 11.580094 | 10 | 0.0000004 | **** |
| 271.1656_4.57 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.242981 | 10 | 0.0488000 | * |
| 277.1432_7.079 | rel_abund_log2 | Red | Yellow | 11 | 11 | 3.249540 | 10 | 0.0087300 | ** |
| 302.1959_0.798 | rel_abund_log2 | Red | Yellow | 11 | 11 | 2.921904 | 10 | 0.0153000 | * |
| 316.1385_4.504 | rel_abund_log2 | Red | Yellow | 11 | 11 | 11.428899 | 10 | 0.0000005 | **** |
| 385.1464_0.793 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.356238 | 10 | 0.0402000 | * |
| 394.1701_3.105 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.243519 | 10 | 0.0487000 | * |
| 429.2112_4.041 | rel_abund_log2 | Red | Yellow | 11 | 11 | -2.626168 | 10 | 0.0253000 | * |
| 431.0964_3.273 | rel_abund_log2 | Red | Yellow | 11 | 11 | 12.731899 | 10 | 0.0000002 | **** |
| 431.0966_3.757 | rel_abund_log2 | Red | Yellow | 11 | 11 | 14.036215 | 10 | 0.0000001 | **** |
| 431.0972_4.035 | rel_abund_log2 | Red | Yellow | 11 | 11 | 16.113895 | 10 | 0.0000000 | **** |
| 433.1121_3.884 | rel_abund_log2 | Red | Yellow | 11 | 11 | 11.045530 | 10 | 0.0000006 | **** |
| 433.1123_3.32 | rel_abund_log2 | Red | Yellow | 11 | 11 | 12.445589 | 10 | 0.0000002 | **** |
| 435.1282_4.891 | rel_abund_log2 | Red | Yellow | 11 | 11 | 17.827128 | 10 | 0.0000000 | **** |
| 447.0916_4.229 | rel_abund_log2 | Red | Yellow | 11 | 11 | 11.638891 | 10 | 0.0000004 | **** |
| 449.1046_3.633 | rel_abund_log2 | Red | Yellow | 11 | 11 | 3.864036 | 10 | 0.0031400 | ** |
| 449.1059_4.17 | rel_abund_log2 | Red | Yellow | 11 | 11 | 3.176541 | 10 | 0.0098800 | ** |
| 461.1074_3.814 | rel_abund_log2 | Red | Yellow | 11 | 11 | 19.029358 | 10 | 0.0000000 | **** |
| 461.1078_3.304 | rel_abund_log2 | Red | Yellow | 11 | 11 | 18.846925 | 10 | 0.0000000 | **** |
# make a column with regular abundances (not log transformed)
# NOTE(review): assumes rel_abund_log2 was produced by log2() of the raw
# relative abundance, so 2^x inverts it exactly — confirm upstream transform
df_for_stats <- df_for_stats %>%
mutate(rel_abund = 2^(rel_abund_log2))
# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
# (post-intervention samples only; one mean per Intervention x feature, then
# widened so each feature row has a Red and a Yellow column for the ratio)
red_v_yellow_volcano_data <- df_for_stats %>%
filter(pre_post == "post") %>%
group_by(Intervention, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = Intervention, values_from = mean_rel_abund) %>%
mutate(FC_postRed_div_postYellow = Red/Yellow)
# bind back pval: join on mz_rt rather than bind_cols(), so each feature is
# matched to its own p-value even if the two tables are ordered differently
# (bind_cols silently assumes identical row order in both tables)
red_v_yellow_tobind <- post_t.test_paired %>%
dplyr::select(mz_rt, p)
# calculate log2FC, and -log10p
# NOTE(review): FC is a ratio of positive mean abundances, so the FC > 0
# branch should always be taken; the negative branch is kept defensively
red_v_yellow_volcano_data <-
left_join(red_v_yellow_volcano_data, red_v_yellow_tobind, by = "mz_rt") %>%
mutate(log2_FC_postRed_div_postYellow = if_else(FC_postRed_div_postYellow > 0,
log2(FC_postRed_div_postYellow),
-(log2(abs(FC_postRed_div_postYellow)))),
neglog10p = -log10(p))
# Features present after red juice but absent after yellow would give an
# infinite log2 fold change; cap those at a constant (8). With no zero
# abundances in the current data this leaves every value unchanged.
red_v_yellow_volcano_data <- red_v_yellow_volcano_data %>%
mutate(log2_FC_postRed_div_postYellow = case_when(
is.infinite(log2_FC_postRed_div_postYellow) ~ 8,
TRUE ~ log2_FC_postRed_div_postYellow))
# create a df of features passing FC and pval cutoffs higher in post-Red
# NOTE(review): cutoff filters on `p` directly; the plot captions downstream
# describe it as an FDR corrected p-value — confirm whether `p` from the
# paired t-test output is raw or already adjusted
higher_postRed <- red_v_yellow_volcano_data %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow >= 1)
# create a df of features passing FC and pval cutoffs higher in post-control
higher_postYellow <- red_v_yellow_volcano_data %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow <= -1) write_csv(higher_postRed,
"intervention-comp-sig-RED-C18Pos-05Jun23.csv")
write_csv(higher_postYellow,
"intervention-comp-sig-YELLOW-C18Pos-05Jun23.csv")(red_v_yellow_volcano <- red_v_yellow_volcano_data %>%
ggplot(aes(x = log2_FC_postRed_div_postYellow, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change tomato/control: {round(FC_postRed_div_postYellow, 2)}"))) +
# all features in grey, then re-draw the significant ones in color on top
geom_point(color = "grey") +
geom_point(data = higher_postRed,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "tomato") +
geom_point(data = higher_postYellow,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_vline(xintercept = -1, linetype = "dashed", color = "grey") +
# 1.3 = -log10(0.05): the p-value cutoff drawn on the y axis
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
# NOTE(review): caption claims an FDR corrected p-value, but the significance
# filter upstream uses `p` directly — confirm p is actually FDR-adjusted
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy and Control Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption while yellow points are higher \nafter control tomato juice consumption",
caption = "Vertical dashed lines represent a log fold change >1 or < -1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (TomatoSoy/Control)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(red_v_yellow_volcano_ggplotly <- ggplotly(red_v_yellow_volcano, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = red_v_yellow_volcano,
filename = "red_v_yellow_volcano.svg")
# save interactive volcano plot
saveWidget(widget = red_v_yellow_volcano_ggplotly,
file = "interactive_redvyellow_volcano_plot.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
# Post-intervention Red vs Yellow comparison, re-run with outlier subjects
# 6106 and 6112 excluded (same pipeline as the full-data version above)
red_v_yel_volcano_data_noOutlier <- df_for_stats %>%
filter(pre_post == "post",
Subject != 6106,
Subject != 6112) %>% # remove outlier subj
group_by(Intervention, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = Intervention, values_from = mean_rel_abund) %>%
mutate(FC_postRed_div_postYellow = Red/Yellow)
# bind back pval
# NOTE(review): bind_cols() assumes post_noOutlier_t.test_paired has exactly
# one row per feature in the same mz_rt order as the volcano data — if either
# table is sorted differently the p-values attach to the wrong features.
# A join by mz_rt would be safer; confirm the ordering.
red_v_yel_tobind_noOutlier <- post_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p (FC of positive means, so the > 0 branch
# should always be taken)
red_v_yel_volcano_data_noOutlier <-
bind_cols(red_v_yel_volcano_data_noOutlier, red_v_yel_tobind_noOutlier) %>%
mutate(log2_FC_postRed_div_postYellow = if_else(FC_postRed_div_postYellow > 0,
log2(FC_postRed_div_postYellow),
-(log2(abs(FC_postRed_div_postYellow)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. I will choose 8. At this point I don't have any absent features so the data shouldn't change.
red_v_yel_volcano_data_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
mutate(log2_FC_postRed_div_postYellow = if_else(is.infinite(log2_FC_postRed_div_postYellow),
8, log2_FC_postRed_div_postYellow))
# create a df of features passing FC and pval cutoffs higher in post-Red
higher_postRed_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow >= 1)
# create a df of features passing FC and pval cutoffs higher in post-control
higher_postYellow_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_postRed_div_postYellow <= -1) write_csv(higher_postRed_noOutlier,
"intervention-comp-sig-RED-nooutliers-C18Pos-15Jun23.csv")
write_csv(higher_postYellow_noOutlier,
"intervention-comp-sig-YELLOW-nooutliers-C18Pos-15Jun23.csv")(red_v_yellow_volcano_noOutlier <- red_v_yel_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_postRed_div_postYellow, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change tomato/control: {round(FC_postRed_div_postYellow, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = higher_postRed_noOutlier,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "tomato") +
geom_point(data = higher_postYellow_noOutlier,
aes(x = log2_FC_postRed_div_postYellow, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_vline(xintercept = -1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy and Control Juice Consumption",
# BUG FIX: subtitle previously said only subject 6106 was removed, but the
# upstream filter excludes both 6106 and 6112
subtitle = "Red points are higher after tomato-soy juice consumption while yellow points are higher \nafter control tomato juice consumption. Subjects 6106 and 6112 removed",
caption = "Vertical dashed lines represent a log fold change >1 or < -1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (TomatoSoy/Control)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(red_v_yellow_volcano_ggplotly_noOutlier <- ggplotly(red_v_yellow_volcano_noOutlier, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = red_v_yellow_volcano_noOutlier,
filename = "red_v_yellow_volcano_noOutlier.svg")
# save interactive volcano plot
saveWidget(widget = red_v_yellow_volcano_ggplotly_noOutlier,
file = "interactive_redvyellow_volcano_plot_noOutlier.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
# Pre vs post comparison within the Red (tomato-soy) intervention:
# mean raw abundance per timepoint per feature, then post/pre fold change
red_volcano_data <- df_for_stats %>%
filter(Intervention == "Red") %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
# NOTE(review): bind_cols() relies on red_t.test_paired rows being in the
# same mz_rt order as red_volcano_data — verify, or join by mz_rt instead
red_tobind <- red_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
red_volcano_data <-
bind_cols(red_volcano_data, red_tobind) %>%
mutate(log2_FC_post_div_pre = if_else(FC_post_div_pre > 0,
log2(FC_post_div_pre),
-(log2(abs(FC_post_div_pre)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. Only use if there are 0s in data
#red_volcano_data <- red_volcano_data %>%
#mutate(log2_FC_post_div_pre = if_else(is.infinite(log2_FC_post_div_pre),
#8, log2_FC_post_div_pre))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
red_higher_post <- red_volcano_data %>%
filter(p <= 0.05 & log2_FC_post_div_pre >= 1)Save file of features that pass FC and pvalue cutoffs
write_csv(red_higher_post,
"pre-post-sig-RED-C18Pos-15Jun23.csv")(red_volcano <- red_volcano_data %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = red_higher_post,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "tomato") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption when compared to prior to consumption",
caption = "Vertical dashed line represents a log fold change >1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(red_volcano_ggplotly <- ggplotly(red_volcano, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = red_volcano,
filename = "red_volcano.svg")
# save interactive volcano plot
saveWidget(widget = red_volcano_ggplotly,
file = "interactive_red_volcano_plot.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
# Red pre vs post with outlier subjects 6106 and 6112 excluded
red_volcano_data_noOutlier <- df_for_stats %>%
filter(Intervention == "Red",
Subject != 6106,
Subject != 6112) %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
# NOTE(review): bind_cols() assumes red_noOutlier_t.test_paired rows match
# the mz_rt order of red_volcano_data_noOutlier — confirm, or join by mz_rt
red_tobind_noOutlier <- red_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
red_volcano_data_noOutlier <-
bind_cols(red_volcano_data_noOutlier, red_tobind_noOutlier) %>%
mutate(log2_FC_post_div_pre = if_else(FC_post_div_pre > 0,
log2(FC_post_div_pre),
-(log2(abs(FC_post_div_pre)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. Only use if there are 0s in data
#red_volcano_data_noOutlier <- red_volcano_data_noOutlier %>%
#mutate(log2_FC_post_div_pre = if_else(is.infinite(log2_FC_post_div_pre),
#8, log2_FC_post_div_pre))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
red_higher_post_noOutlier <- red_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_post_div_pre >= 1)write_csv(red_higher_post_noOutlier,
"pre-post-sig-RED-nooutliers-C18Pos-15Jun23.csv")(red_volcano_noOutlier <- red_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = red_higher_post_noOutlier,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "tomato") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Tomato-Soy Juice Consumption",
subtitle = "Red points are higher after tomato-soy juice consumption when compared to prior to consumption",
caption = "Vertical dashed line represents a log fold change >1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(red_volcano_ggplotly_noOutlier <- ggplotly(red_volcano_noOutlier, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = red_volcano_noOutlier,
filename = "red_volcano_noOutlier.svg")
# save interactive volcano plot
saveWidget(widget = red_volcano_ggplotly_noOutlier,
file = "interactive_red_volcano_plot_noOutlier.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
# Pre vs post comparison within the Yellow (control) intervention
yel_volcano_data <- df_for_stats %>%
filter(Intervention == "Yellow") %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
# NOTE(review): bind_cols() assumes ctrl_t.test_paired rows are in the same
# mz_rt order as yel_volcano_data — confirm, or join by mz_rt instead
yel_tobind <- ctrl_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
yel_volcano_data <-
bind_cols(yel_volcano_data, yel_tobind) %>%
mutate(log2_FC_post_div_pre = if_else(FC_post_div_pre > 0,
log2(FC_post_div_pre),
-(log2(abs(FC_post_div_pre)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. Only use if there are 0s in data
#yel_volcano_data <- yel_volcano_data %>%
#mutate(log2_FC_post_div_pre = if_else(is.infinite(log2_FC_post_div_pre),
#8, log2_FC_post_div_pre))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
yellow_higher_post <- yel_volcano_data %>%
filter(p <= 0.05 & log2_FC_post_div_pre >= 1)write_csv(yellow_higher_post,
"pre-post-sig-YELLOW-C18Pos-15Jun23.csv")(yellow_volcano <- yel_volcano_data %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = yellow_higher_post,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Control, Yellow Tomato Juice Consumption",
subtitle = "Yellow points are higher after control juice consumption when compared to prior to consumption",
caption = "Vertical dashed line represents a log fold change >1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(yellow_volcanoo_ggplotly <- ggplotly(yellow_volcano, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = yellow_volcano,
filename = "yellow_volcano.svg")
# save interactive volcano plot
# BUG FIX: this previously re-saved red_volcano_ggplotly over the red HTML
# file; save the yellow interactive plot instead (the object is defined
# above as `yellow_volcanoo_ggplotly`, typo and all)
saveWidget(widget = yellow_volcanoo_ggplotly,
file = "interactive_yellow_volcano_plot.html")# calculate mean rel abund (not log) by sample, and avg fold change (FC) diff by feature
# Yellow (control) pre vs post with outlier subjects excluded.
# BUG FIX: the filter previously repeated `Subject != 6106` twice, so 6112
# was never removed — every parallel noOutlier chunk (Red pre/post and
# Red-vs-Yellow) excludes both 6106 and 6112
yel_volcano_data_noOutlier <- df_for_stats %>%
filter(Intervention == "Yellow",
Subject != 6106,
Subject != 6112) %>%
group_by(pre_post, mz_rt) %>%
summarize(mean_rel_abund = mean(rel_abund)) %>%
pivot_wider(names_from = pre_post, values_from = mean_rel_abund) %>%
mutate(FC_post_div_pre = post/pre)
# bind back pval
# NOTE(review): bind_cols() assumes ctrl_noOutlier_t.test_paired rows are in
# the same mz_rt order as the volcano data — confirm, or join by mz_rt.
# Also confirm ctrl_noOutlier_t.test_paired excludes the same subjects as
# the filter above (the red noOutlier chunks drop both 6106 and 6112).
yel_tobind_noOutlier <- ctrl_noOutlier_t.test_paired %>%
dplyr::select(p)
# calculate log2FC, and -log10p
yel_volcano_data_noOutlier <-
bind_cols(yel_volcano_data_noOutlier, yel_tobind_noOutlier) %>%
mutate(log2_FC_post_div_pre = if_else(FC_post_div_pre > 0,
log2(FC_post_div_pre),
-(log2(abs(FC_post_div_pre)))),
neglog10p = -log10(p))
# set FC for features present in post-red but absent in post-yellow to a constant. Only use if there are 0s in data
#yel_volcano_data_noOutlier <- yel_volcano_data_noOutlier %>%
#mutate(log2_FC_post_div_pre = if_else(is.infinite(log2_FC_post_div_pre),
#8, log2_FC_post_div_pre))
# create a df of features passing FC and pval cutoffs higher in post-intervention compared to pre
yel_higher_post_noOutlier <- yel_volcano_data_noOutlier %>%
filter(p <= 0.05 & log2_FC_post_div_pre >= 1)write_csv(yel_higher_post_noOutlier,
"pre-post-sig-YELLOW-nooutliers-C18Pos-15Jun23.csv")(yel_volcano_noOutlier <- yel_volcano_data_noOutlier %>%
ggplot(aes(x = log2_FC_post_div_pre, y = neglog10p,
text = glue("Mass_retention time: {mz_rt}
P-value: {p}
Fold change post/pre: {round(FC_post_div_pre, 2)}"))) +
geom_point(color = "grey") +
geom_point(data = yel_higher_post_noOutlier,
aes(x = log2_FC_post_div_pre, y = neglog10p),
color = "yellow2") +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey") +
geom_hline(yintercept = 1.3, linetype = "dashed", color = "grey") +
coord_cartesian(xlim = c(-5, 8)) +
labs(title = "Volcano Plot of Features Different in People After Control, Yellow Tomato Juice Consumption",
subtitle = "Yellow points are higher after control juice consumption when compared to prior to consumption.\nSubject 6106 removed",
caption = "Vertical dashed line represents a log fold change >1, and horizontal dashed line represents an FDR corrected \np-value of 0.05.",
x = "Log2 Fold Change (Post/Pre)",
y = "-Log10(P-value)") +
theme_bw() +
theme(plot.title = element_text(size = 12, hjust = 0),
plot.caption = element_text(hjust = 0.5)))(yel_volcano_ggplotly_noOutlier <- ggplotly(yel_volcano_noOutlier, tooltip = "text"))Save plots
# save volcano plot
ggsave(plot = yel_volcano_noOutlier,
filename = "yel_volcano_noOutlier.svg")
# save interactive volcano plot
saveWidget(widget = yel_volcano_ggplotly_noOutlier,
file = "interactive_yel_volcano_plot_noOutlier.html")